#full_data <- readRDS('../data/full_data_20230930.rds')
hourly_full <- readRDS('../data/hourly_full_20230930.rds')
daily_full <- readRDS('../data/daily_full_20230930.rds')

Explore some summary statistics

daily_full <- daily_full %>%
  filter(!is.na(H2S_daily_avg))
hourly_full <- hourly_full %>%
  filter(!is.na(H2S_hourly_avg))
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  4850485 259.1    8354324 446.2         NA  7392759 394.9
## Vcells 19275994 147.1   37130200 283.3      24576 31967808 243.9
# full_data <- full_data %>%
#   filter(!is.na(H2S)) %>%
#   select(-starts_with('daily_'), -starts_with('H2S_daily'), 
#          -all_of(c('Ammonia', 'Benzene', 'Black Carbon', 'DST', 'utm_x', 'utm_y',
#                    'county')))
# gc()

Fixed Monitor stats

monitor_names <- c("ElSegundo" = "El Segundo", 
                   "StAnthony" = "St. Anthony",
                   "Manhattan" = "Manhattan",
                   "WestHS" = "West HS",
                   "ElmAve" = "Elm Ave",
                   "NorthHS" = "North HS",
                   "GuenserPark" = "Guenser Park",
                   "Chico" = "213th & Chico",
                   "Judson" = "Judson",
                   "HarborPark" = "Harbor Park",
                   "FirstMethodist" = "First Methodist",
                   "GStreet" = "G Street",
                   "StLuke" = "St. Luke",
                   "Hudson" = "Hudson",
                   "InnerPort" = "Inner Port")

base_monitor_stat <- daily_full %>%
  group_by(Monitor) %>%
  summarise('Start Date' = strftime(min(day), '%Y-%m-%d'),
            'End Date' = strftime(max(day), '%Y-%m-%d'),
            'Closest Refinery' = unique(closest_ref),
            'Distance to Nearest Refinery (m)' = round(unique(dist_ref)),
            'Angle to Refinery' = unique(angle_ref),
            'Distance to Nearest WRP (m)' = round(unique(dist_wrp)),
            'Capacity of Nearest WRP' = unique(closest_wrp_capacity),
            'Angle to WRP' = round(unique(angle_wrp)),
            'Distance to Dominguez Channel (m)' = round(unique(dist_dc)),
            'Elevation' = unique(elevation),
            'Enhanced Vegetation Index' = unique(EVI)) %>%
  mutate(`Closest Refinery` = case_when(`Closest Refinery` == "Phillips 66 (Wilmington)" ~ "Phillips 66",
                                        `Closest Refinery` == "Torrance Refinery" ~ "Torrance",
                                        `Closest Refinery` == "Valero Refinery" ~ "Valero",
                                        `Closest Refinery` == "Marathon (Carson)" ~ "Marathon Carson",
                                        `Closest Refinery` == "Marathon (Wilmington)" ~ "Marathon Wilmington",
                                        .default = `Closest Refinery`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(base_monitor_stat, digits = 2, format = 'html')
Monitor Start Date End Date Closest Refinery Distance to Nearest Refinery (m) Angle to Refinery Distance to Nearest WRP (m) Capacity of Nearest WRP Angle to WRP Distance to Dominguez Channel (m) Elevation Enhanced Vegetation Index
El Segundo 2020-01-01 2020-08-13 Chevron El Segundo 1174 190.87 2810 850 294 6437 60 0.17
St. Anthony 2020-04-23 2023-09-30 Chevron El Segundo 970 188.57 2825 850 298 6543 44 0.17
Manhattan 2020-03-24 2023-09-30 Chevron El Segundo 2341 341.46 5462 850 325 6145 42 0.19
West HS 2020-01-01 2022-04-29 Torrance 3536 85.69 9333 400 123 1547 36 0.15
Elm Ave 2020-01-01 2023-09-30 Torrance 1362 5.34 5966 400 132 3955 32 0.07
North HS 2020-01-01 2022-04-29 Torrance 1779 160.72 8569 400 145 4858 24 0.15
Guenser Park 2020-04-23 2023-09-30 Torrance 2400 220.25 7702 400 159 375 16 0.14
213th & Chico 2021-10-14 2022-01-28 Marathon Carson 2879 145.62 4297 400 213 50 7 0.12
Judson 2020-02-25 2023-09-30 Marathon Carson 2715 112.45 2692 400 213 1481 13 0.14
Harbor Park 2020-01-01 2023-09-30 Phillips 66 1463 183.71 2012 400 6 4262 12 0.60
First Methodist 2020-03-04 2023-09-30 Phillips 66 1124 205.55 2456 400 355 3792 14 0.21
G Street 2021-01-20 2023-09-30 Phillips 66 717 222.51 2940 400 356 3748 8 0.09
St. Luke 2020-02-18 2023-09-30 Marathon Carson 2768 260.38 6910 400 256 1790 10 0.17
Hudson 2020-01-01 2023-09-30 Marathon Wilmington 1378 240.55 5920 400 272 705 8 0.14
Inner Port 2020-04-22 2023-09-30 Valero 2022 260.92 5970 15 228 1937 5 0.04

Since Feb 2022

sincefeb2022_stat <- daily_full %>%
  filter(day > '2022-01-31') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(sincefeb2022_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 608 140.15 3.41 2.45 0.52 7.83 236.36 4.65 1.96
Manhattan 599 6.82 1.40 0.82 0.39 3.40 233.30 0.04 0.00
West HS 88 2.90 0.88 1.09 0.40 6.24 272.30 0.03 3.00
Elm Ave 608 5.10 3.10 1.60 0.89 4.53 243.43 0.04 2.00
North HS 89 4.00 1.61 1.37 0.51 5.36 240.86 0.00 0.00
Guenser Park 546 7.66 2.66 1.76 0.80 4.43 260.55 0.02 0.00
Judson 583 9.75 2.80 1.69 0.54 3.80 272.41 0.53 0.00
Harbor Park 414 9.65 2.48 1.85 0.50 2.89 296.16 0.07 44.30
First Methodist 606 14.72 2.34 2.33 0.71 3.27 269.13 0.11 29.22
G Street 607 39.18 2.72 3.82 0.83 4.96 277.89 0.11 19.36
St. Luke 602 11.62 3.43 2.25 0.74 3.66 291.26 0.23 5.00
Hudson 599 98.17 6.08 2.95 0.99 3.49 39.57 0.23 2.70
Inner Port 582 53.70 5.39 6.31 0.87 5.07 227.02 0.05 83.81

During Disaster (October 2021 - December 2021)

disaster_stat <- daily_full %>%
  filter(year == '2021', month %in% c('10', '11', '12')) %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(disaster_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 92 20.21 2.41 3.01 0.69 7.07 343.87 2.96 3.00
Manhattan 92 25.45 4.06 3.48 1.51 2.74 212.03 0.11 0.00
West HS 92 41.50 4.87 3.50 0.74 3.95 239.76 0.53 3.00
Elm Ave 92 63.50 8.15 7.73 1.34 3.77 236.77 0.53 2.00
North HS 92 98.50 7.88 6.93 1.15 3.74 235.24 0.46 0.00
Guenser Park 92 211.67 14.48 14.64 1.63 3.51 276.58 0.46 0.00
213th & Chico 79 13407.18 1639.53 1025.43 128.07 3.55 293.63 17.15 0.00
Judson 92 742.25 69.08 59.79 6.86 2.77 294.11 31.71 0.00
Harbor Park 92 75.93 9.46 8.51 1.20 2.30 306.88 0.49 42.99
First Methodist 92 149.47 14.11 10.92 1.74 2.62 296.60 1.64 30.33
G Street 92 48.67 6.61 9.15 1.75 3.58 334.25 1.64 18.33
St. Luke 92 119.72 12.26 10.91 2.43 3.07 351.66 0.91 3.00
Hudson 92 192.64 19.45 16.72 3.36 2.53 13.51 0.91 2.34
Inner Port 90 136.50 12.21 15.91 3.18 3.47 9.29 0.09 86.34
# Try only the october for the prediction map
disaster_oct_stat <- daily_full %>%
  filter(year == '2021', month == '10') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(disaster_oct_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 31 20.21 2.41 3.89 0.73 7.49 304.48 6.39 3
Manhattan 31 25.45 4.06 5.47 1.77 2.91 225.92 0.29 0
West HS 31 41.50 4.87 7.56 1.06 4.35 231.91 1.42 3
Elm Ave 31 63.50 8.15 18.37 2.37 4.16 238.62 1.42 2
North HS 31 98.50 7.88 16.83 2.02 4.11 236.18 1.32 0
Guenser Park 31 211.67 14.48 37.77 3.12 3.88 263.18 1.32 0
213th & Chico 18 13407.18 1639.53 4337.41 536.21 4.41 260.39 49.44 0
Judson 31 742.25 69.08 167.25 17.18 2.91 290.90 79.10 0
Harbor Park 31 75.93 9.46 20.66 2.63 2.29 301.72 1.23 40
First Methodist 31 149.47 14.11 25.88 3.25 2.67 288.59 4.65 29
G Street 31 47.27 6.61 14.01 2.41 3.65 323.75 4.65 17
St. Luke 31 119.72 12.26 22.38 4.26 3.23 330.94 1.77 3
Hudson 31 192.64 19.45 39.27 5.95 2.76 343.90 1.77 2
Inner Port 31 136.50 12.21 22.60 3.79 3.87 325.23 0.26 87

Normal Period (Jan 2020- May 2023) excluding disaster

normal_stat <- daily_full %>%
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(normal_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
El Segundo 223 9.24 3.67 1.70 0.57 7.92 255.36 0.25 2.19
St. Anthony 1160 140.15 3.43 2.64 0.57 7.76 243.55 4.63 1.98
Manhattan 1180 6.82 2.38 1.03 0.53 4.02 241.77 0.06 0.00
West HS 755 18.50 3.01 1.54 0.60 5.40 247.07 0.04 1.91
Elm Ave 1268 11.40 3.20 1.87 0.92 4.53 242.45 0.04 1.86
North HS 758 30.60 2.71 1.91 1.02 4.62 241.06 0.01 0.00
Guenser Park 1073 16.18 2.66 1.50 0.62 4.55 259.26 0.02 0.00
213th & Chico 28 14.27 3.92 4.78 3.12 2.34 312.27 2.75 0.00
Judson 1186 15.78 3.63 1.99 0.72 4.41 266.20 0.43 0.00
Harbor Park 1018 17.42 2.67 1.76 0.47 3.37 283.50 0.07 46.82
First Methodist 1208 20.91 4.18 2.42 0.76 3.81 263.94 0.16 30.09
G Street 883 382.76 18.66 4.13 0.83 4.74 297.94 0.13 19.30
St. Luke 1177 11.62 3.43 2.09 0.69 3.94 270.65 0.16 3.86
Hudson 1266 98.17 6.08 2.87 0.98 3.73 254.80 0.15 3.15
Inner Port 1139 53.70 5.39 5.54 0.84 4.82 229.05 0.05 88.32
# Try only the Oct 2022
normal_oct_stat <- daily_full %>%
  filter(year == '2022' & month == '10') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(normal_oct_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 31 4.24 1.54 2.12 0.98 6.65 259.53 14.13 3
Manhattan 23 1.95 0.89 0.96 0.46 2.68 224.60 0.00 0
Elm Ave 31 3.48 1.59 1.70 1.01 3.45 239.50 0.19 2
Guenser Park 31 6.50 2.02 2.59 1.23 3.14 266.04 0.00 0
Judson 31 2.97 0.63 0.77 0.25 2.59 284.61 1.16 0
Harbor Park 28 7.04 0.90 2.25 0.42 2.24 293.73 0.29 45
First Methodist 31 8.42 2.09 2.60 0.73 2.66 275.98 0.26 30
G Street 31 11.77 1.73 4.07 1.00 3.50 331.32 0.26 20
St. Luke 31 6.40 1.64 2.01 0.51 2.78 8.03 0.65 5
Hudson 29 13.34 2.12 3.18 1.20 2.71 57.46 0.62 3
Inner Port 31 52.63 2.07 6.95 0.80 3.02 161.12 0.00 80

Table 1: Monitor statistics

table1 <- base_monitor_stat %>%
  select(-c(`Angle to Refinery`, `Angle to WRP`, `Capacity of Nearest WRP`)) %>%
  left_join(disaster_stat %>% 
              select(Monitor, `Avg Daily Average`) %>% 
              rename(`Disaster Avg Daily Average` = `Avg Daily Average`), 
            join_by(Monitor)) %>%
  left_join(normal_stat %>% 
              select(Monitor, `Avg Daily Average`, `Average daily odor complaints within zipcode`) %>%
              rename(`Normal Avg Daily Average` = `Avg Daily Average`,
                     `Normal Avg Daily odor complaints` = `Average daily odor complaints within zipcode`), 
            join_by(Monitor)) %>%
  mutate(`Closest Refinery` = paste0(`Closest Refinery`, ' (', round(`Distance to Nearest Refinery (m)`/1000, 1), ')'),
         '#' = 1:n()) %>%
  select(-`Distance to Nearest Refinery (m)`) %>%
  relocate('#', Monitor, `Start Date`, `End Date`, `Closest Refinery`, `Normal Avg Daily Average`, `Disaster Avg Daily Average`, `Normal Avg Daily odor complaints`)

table1_kable <- knitr::kable(table1, format = 'latex', digits = 2)
writeLines(table1_kable, '../figures/table1.tex')

knitr::kable(table1, format = 'html', digits = 2)
# Monitor Start Date End Date Closest Refinery Normal Avg Daily Average Disaster Avg Daily Average Normal Avg Daily odor complaints Distance to Nearest WRP (m) Distance to Dominguez Channel (m) Elevation Enhanced Vegetation Index
1 El Segundo 2020-01-01 2020-08-13 Chevron El Segundo (1.2) 0.57 NA 0.25 2810 6437 60 0.17
2 St. Anthony 2020-04-23 2023-09-30 Chevron El Segundo (1) 0.57 0.69 4.63 2825 6543 44 0.17
3 Manhattan 2020-03-24 2023-09-30 Chevron El Segundo (2.3) 0.53 1.51 0.06 5462 6145 42 0.19
4 West HS 2020-01-01 2022-04-29 Torrance (3.5) 0.60 0.74 0.04 9333 1547 36 0.15
5 Elm Ave 2020-01-01 2023-09-30 Torrance (1.4) 0.92 1.34 0.04 5966 3955 32 0.07
6 North HS 2020-01-01 2022-04-29 Torrance (1.8) 1.02 1.15 0.01 8569 4858 24 0.15
7 Guenser Park 2020-04-23 2023-09-30 Torrance (2.4) 0.62 1.63 0.02 7702 375 16 0.14
8 213th & Chico 2021-10-14 2022-01-28 Marathon Carson (2.9) 3.12 128.07 2.75 4297 50 7 0.12
9 Judson 2020-02-25 2023-09-30 Marathon Carson (2.7) 0.72 6.86 0.43 2692 1481 13 0.14
10 Harbor Park 2020-01-01 2023-09-30 Phillips 66 (1.5) 0.47 1.20 0.07 2012 4262 12 0.60
11 First Methodist 2020-03-04 2023-09-30 Phillips 66 (1.1) 0.76 1.74 0.16 2456 3792 14 0.21
12 G Street 2021-01-20 2023-09-30 Phillips 66 (0.7) 0.83 1.75 0.13 2940 3748 8 0.09
13 St. Luke 2020-02-18 2023-09-30 Marathon Carson (2.8) 0.69 2.43 0.16 6910 1790 10 0.17
14 Hudson 2020-01-01 2023-09-30 Marathon Wilmington (1.4) 0.98 3.36 0.15 5920 705 8 0.14
15 Inner Port 2020-04-22 2023-09-30 Valero (2) 0.84 3.18 0.05 5970 1937 5 0.04

GAM

Feature Selection

Prepare feature and data tables

hourly_responses <- c('H2S_hourly_avg', 'H2S_hourly_max')

# since feb 2022
daily_data_sincefeb2022 <- daily_full %>% filter(day > '2022-01-31')
hourly_data_sincefeb2022 <- hourly_full %>% filter(day > '2022-01-31')

# Disaster
daily_data_dis <- daily_full %>% filter(year == '2021', month %in% c('10', '11', '12'))
hourly_data_dis <- hourly_full %>% filter(year == '2021', month %in% c('10', '11', '12'))

# Exclude disaster stepwise
daily_data_excl_dis <- daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))
hourly_data_excl_dis <- hourly_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))

# Everything w. disaster indicator
daily_data_dis_ind <- daily_full %>% 
  mutate(disaster = 
           if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))
hourly_data_dis_ind <- hourly_full %>% 
  mutate(disaster = 
           if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))

Select smooth

daily_responses <- c('H2S_daily_avg', 'log(H2S_daily_avg)', 
                     'H2S_daily_max', 'log(H2S_daily_max)')
hourly_responses <- c('H2S_hourly_avg', 'log(H2S_hourly_avg)',
                      'H2S_hourly_max', 'log(H2S_hourly_max)')
dateranges <- c('sincefeb2022', 'dis', 'excl_dis', 'dis_ind', 'full')
smooth <- c("s(as.numeric(month),bs='cc')", 
            "s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
            "te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")
smooth_tibble <- tibble(features = c(list(c(smooth[1])), 
                                     list(c(smooth[2])),
                                     list(c(smooth[3])),
                                     list(c(smooth[1:2])),
                                     list(c(smooth[c(1, 3)])),
                                     list(c(smooth[2:3])),
                                     list(c(smooth[1:3]))),
                        disaster_applicable = c(0, 1, 1, 0, 0, 1, 0))
# smooth_compare <- crossing(response = c(daily_responses, hourly_responses), 
#                            daterange = dateranges) %>%
#   cross_join(smooth_tibble) %>%
#   mutate(GCV = NA) %>%
#   filter(!(daterange == 'dis' & disaster_applicable == 0)) %>%
#   select(-disaster_applicable)
# 
# for (i in 1:nrow(smooth_compare)) {
#   features <- unname(unlist(smooth_compare[i,'features']))
#   formula_feature_str <- paste(features, collapse = ' + ')
#   formula_str <- paste(smooth_compare[[i, 'response']], formula_feature_str, sep = ' ~ ')
#   formula <- as.formula(formula_str)
# 
#   if (smooth_compare[[i, 'response']] %in% hourly_responses &
#       smooth_compare[[i, 'daterange']] == 'full') {
#     data <- hourly_full
#   } else if (smooth_compare[[i, 'response']] %in% hourly_responses){
#     data <- get(paste0('hourly_data_', smooth_compare[[i, 'daterange']]))
#   } else if (smooth_compare[[i, 'daterange']] == 'full') {
#     data <- daily_full
#   } else {
#     data <- get(paste0('daily_data_', smooth_compare[[i, 'daterange']]))
#   }
# 
#   summary <- summary(gam(formula, data = data, method = 'GCV.Cp', select = TRUE))
#   GCV_new <- summary$sp.criterion[[1]]
#   smooth_compare[i, 'GCV'] <- GCV_new
#   print(str_glue('Completed {i} iterations'))
#   gc()
# }
# 
# smooth_compare <- smooth_compare %>%
#   group_by(response, daterange) %>%
#   mutate(best = if_else(GCV == min(GCV), 1, 0)) %>%
#   mutate(rounded_GCV = round(GCV, 2)) %>%
#   rowwise() %>%
#   mutate(month_smooth = if_else(smooth[1] %in% unlist(features), 1, 0),
#          coord_smooth = if_else(smooth[2] %in% unlist(features), 1, 0),
#          coord_day_3D_smooth = if_else(smooth[3] %in% unlist(features), 1, 0)) %>%
#   ungroup()
# saveRDS(smooth_compare, 'smooth_compare.rds')

smooth_compare <- readRDS('smooth_compare.rds')
# get best smooth models for diff response and daterange
best_smooth <- smooth_compare %>%
  group_by(response, daterange) %>%
  filter(GCV == min(GCV)) %>%
  ungroup() %>%
  select(response, daterange, GCV, features, month_smooth, coord_smooth, coord_day_3D_smooth)
best_smooth
  • In general, having the coordinate smooth on top of the month and 3D smooth will not harm the model (except for daily max models since feb 2022).
  • However, it does not improve it by much either…
  • We will keep this in.

Fit smooth and get Residuals

response_names <- c('da', 'log_da', 'dm', 'log_dm', 'ha', 'log_ha', 'hm', 'log_hm')

# smooth_models <- tibble(name = response_names,
#                         response = c(daily_responses, hourly_responses)) %>%
#   crossing(tibble(daterange = dateranges)) %>%
#   mutate(name = paste(name, daterange, sep = '_'))


smooth_predictors <-
  c("s(as.numeric(month),bs='cc')",
    "s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
    "te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")

# smooth_models$residuals <- rep(list(c()), nrow(smooth_models))
# 
# for (i in 1:nrow(smooth_models)) {
#   # first, get residuals of smooth on response
#   if (smooth_models$daterange[i] == 'dis') {
#     formula_feature_str <- paste(smooth_predictors[c(2, 3)], collapse = ' + ')
#   } else {
#     formula_feature_str <- paste(smooth_predictors, collapse = ' + ')
#   }
# 
#   if (smooth_models[[i, 'response']] %in% hourly_responses &
#       smooth_models[[i, 'daterange']] == 'full') {
#     data <- hourly_full
#   } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
#     data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
#   } else if (smooth_models[[i, 'daterange']] == 'full') {
#     data <- daily_full
#   } else {
#     data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
#   }
# 
#   formula_str <- paste(smooth_models[[i, 'response']], formula_feature_str, sep = ' ~ ')
#   formula <- as.formula(formula_str)
#   residuals <- gam(formula, data = data, method = 'GCV.Cp')$residuals
#   smooth_models$residuals[i] <- list(residuals)
# }
# saveRDS(smooth_models, 'smooth_models.rds')

smooth_models <- readRDS('smooth_models.rds')

Fit residuals to select linear features

disaster_predictors <- c('month', 'weekday', 'wd_avg', 'ws_avg', 
                       'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
                       'monthly_oil_2km', 'monthly_gas_2km', 'active_2km', 
                       'inactive_2km', 'elevation', 'EVI', 
                       'num_odor_complaints', 'closest_wrp_capacity')

everything_predictors <- c('year', 'weekday', 'wd_avg', 'ws_avg', 
                       'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
                       'monthly_oil_2km', 'monthly_gas_2km', 'active_2km', 
                       'inactive_2km', 'elevation', 'EVI', 
                       'num_odor_complaints', 'closest_wrp_capacity')

daily_predictors <- c('daily_downwind_ref', 'daily_downwind_wrp', 'daily_temp',
                      'daily_hum', 'daily_precip')

hourly_predictors <- c('hourly_downwind_ref', 'hourly_downwind_wrp', 'hourly_temp',
                      'hourly_hum', 'hourly_precip')

disaster_linear_pred_str <- paste(disaster_predictors, collapse = ' + ')
everything_linear_pred_str <- paste(everything_predictors, collapse = ' + ')
daily_linear_pred_str <- paste(daily_predictors, collapse = ' + ')
hourly_linear_pred_str <- paste(hourly_predictors, collapse = ' + ')

# for (i in 1:nrow(smooth_models)) {
#   if (smooth_models[[i, 'daterange']] == 'dis_ind' &
#       smooth_models[[i, 'response']] %in% hourly_responses) {
#     formula_str <- paste(everything_linear_pred_str,
#                          hourly_linear_pred_str,
#                          'disaster', sep = ' + ')
#     print(paste0(smooth_models[[i, 'daterange']], ': everything hourly + ind'))
#   } else if (smooth_models[[i, 'daterange']] == 'dis' &
#              smooth_models[[i, 'response']] %in% hourly_responses) {
#     formula_str <- paste(disaster_linear_pred_str,
#                          hourly_linear_pred_str, sep = ' + ')
#     print(paste0(smooth_models[[i, 'daterange']], ': disaster hourly'))
#   } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
#     formula_str <- paste(everything_linear_pred_str,
#                          hourly_linear_pred_str, sep = ' + ')
#     print(paste0(smooth_models[[i, 'daterange']], ': everything hourly'))
#   } else if (smooth_models[[i, 'daterange']] == 'dis_ind') {
#     formula_str <- paste(everything_linear_pred_str,
#                          daily_linear_pred_str,
#                          'disaster', sep = ' + ')
#     print(paste0(smooth_models[[i, 'daterange']], ': everything daily + ind'))
#   } else if (smooth_models[[i, 'daterange']] == 'dis') {
#     formula_str <- paste(disaster_linear_pred_str,
#                          daily_linear_pred_str, sep = ' + ')
#     print(paste0(smooth_models[[i, 'daterange']], ': disaster daily'))
#   } else {
#     formula_str <- paste(everything_linear_pred_str,
#                          daily_linear_pred_str, sep = ' + ')
#     print(paste0(smooth_models[[i, 'daterange']], ': everything daily'))
#   }
#   formula_str <- paste('residuals', formula_str, sep = ' ~ ')
#   formula <- as.formula(formula_str)
# 
#   if (smooth_models[[i, 'response']] %in% hourly_responses &
#       smooth_models[[i, 'daterange']] == 'full') {
#     data <- hourly_full
#   } else if (smooth_models[[i, 'response']] %in% hourly_responses){
#     data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
#   } else if (smooth_models[[i, 'daterange']] == 'full') {
#     data <- daily_full
#   } else {
#     data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
#   }
# 
#   data$residuals <- unlist(smooth_models$residuals[i])
#   regsubsets <- regsubsets(formula, data, nvmax = Inf)
#   assign(paste0(smooth_models$name[i], '_regsubsets'), regsubsets)
#   print(str_glue('Completed {i} rows'))
# }
# 
# for (i in 1:nrow(smooth_models)) {
#   saveRDS(get(paste0(smooth_models$name[i], '_regsubsets')),
#           paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds'))
# }

# read regsubsets
for (i in 1:nrow(smooth_models)) {
  assign(paste0(smooth_models$name[i], '_regsubsets'),
         readRDS(paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds')))
}

Get best model size for each model

# best_model_sizes <- smooth_models %>%
#   group_by(name, response, daterange) %>%
#   summarise(mean_smooth_res = mean(unlist(residuals)),
#             var_smooth_res = round(var(unlist(residuals))), 4) %>%
#   select(any_of(c('name', 'response', 'daterange',
#                   'mean_smooth_res', 'var_smooth_res'))) %>%
#   mutate(Adj.R2 = NA,
#          best_R2 = NA,
#          CP = NA,
#          best_CP = NA,
#          BIC = NA,
#          best_BIC = NA,
#          linear_features = NA)
# for (i in 1:nrow(smooth_models)) {
#   regsubset <- summary(get(paste0(smooth_models$name[i], '_regsubsets')))
#   best_sizes <- tibble(Adj.R2 = which.max(regsubset$adjr2),
#                        best_R2 = regsubset$adjr2[which.max(regsubset$adjr2)],
#                        CP = which.min(regsubset$cp),
#                        best_CP = regsubset$cp[which.max(regsubset$cp)],
#                        BIC = which.min(regsubset$bic),
#                        best_BIC = regsubset$bic[which.max(regsubset$bic)],)
#   best_features <- tibble(linear_features = list(setdiff(names(regsubset$which[best_sizes$CP, ]
#                                                    [unlist(regsubset$which[best_sizes$CP, ])]), '(Intercept)')))
#   best_model_sizes[i,] <- bind_cols(best_model_sizes[i, 1:5], best_sizes, best_features)
# }
# 
# best_models <- best_model_sizes %>%
#   rowwise() %>%
#   mutate(smooth_features = if_else(daterange == 'dis', list(smooth_predictors[2:3]),
#                                    list(smooth_predictors))) %>%
#   mutate(full_features = list(c(unlist(smooth_features), unlist(linear_features)))) %>%
#   ungroup()
# 
# saveRDS(best_models, 'best_gam_models.rds')

best_gam_models <- readRDS('best_gam_models.rds')

Final models

# write function that takes in response, predictors, data and returns gam model 
# different from stepwise function, this has select = FALSE
get_feature_vector <- function(response, daterange) {
  feature_vec <-  best_gam_models %>%
      filter(response == .env$response & daterange == .env$daterange) %>%
      pull(full_features) %>%
      unlist()
  feature_vec <- str_replace_all(feature_vec, 'month\\d+', 'month')
  feature_vec <- str_replace_all(feature_vec, 'year\\d+', 'year')
  feature_vec <- str_replace_all(feature_vec, 'weekday\\D+', 'weekday')
  feature_vec <- unique(feature_vec)
  return(feature_vec)
}

get_data <- function(response, daterange) {
  if (response %in% hourly_responses &
      daterange == 'full') {
    data <- hourly_full
  } else if (response %in% hourly_responses){
    data <- get(paste0('hourly_data_', daterange))
  } else if (daterange == 'full') {
    data <- daily_full
  } else {
    data <- get(paste0('daily_data_', daterange))
  }
  return(data)
}

get_gam_model <- function(response, daterange) {
  predictors <- get_feature_vector(response, daterange)
  formula_feature_str <- paste(predictors, collapse = ' + ')
  formula_str <- paste(response, formula_feature_str, sep = ' ~ ')
  formula <- as.formula(formula_str)

  data <- get_data(response, daterange)
  gam_model <- gam(formula, data = data, method = 'GCV.Cp')
  return(gam_model)
}

# for (i in 1:nrow(best_gam_models)) {
#   model <- get_gam_model(best_gam_models$response[i],
#                          best_gam_models$daterange[i])
#   assign(paste0(best_gam_models$name[i], '_', best_gam_models$daterange[i], '_gam'), model)
#   saveRDS(model, paste0('gam_models/', paste0(best_gam_models$name[i], '_gam.rds')))
# }
# # compare empty init vs full init and find best ones
# final_model_features_table <- model_features_table %>%
#   group_by(stat, date) %>%
#   filter(GCV == min(GCV))
# 
# final_model_features_table <- final_model_features_table %>%
#   select(-c(init, model_name)) %>%
#   distinct()
for (i in 1:nrow(best_gam_models)) {
  assign(paste0(best_gam_models$name[i], '_gam'), 
         readRDS(paste0('gam_models/', best_gam_models$name[i], '_gam.rds')))
}

Daily Average

Since February 2022

# Since feb 2022
summary(da_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km + 
##     elevation + EVI + num_odor_complaints + daily_downwind_ref + 
##     daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.360e+00  1.377e-01  17.134  < 2e-16 ***
## year2023            -1.502e-01  4.090e-02  -3.671 0.000243 ***
## weekdayMon           8.501e-02  1.464e-02   5.808 6.61e-09 ***
## weekdayTue           1.492e-01  1.461e-02  10.210  < 2e-16 ***
## weekdayWed           1.600e-01  1.465e-02  10.925  < 2e-16 ***
## weekdayThu           1.112e-01  1.466e-02   7.588 3.71e-14 ***
## weekdayFri           1.301e-01  1.462e-02   8.901  < 2e-16 ***
## weekdaySat           6.965e-02  1.461e-02   4.767 1.91e-06 ***
## wd_avg               2.874e-04  5.319e-05   5.403 6.80e-08 ***
## ws_avg              -6.768e-02  2.767e-03 -24.464  < 2e-16 ***
## I(1/dist_wrp^2)      6.042e-07  3.582e-07   1.686 0.091755 .  
## I(1/dist_ref^2)      1.458e-05  4.195e-06   3.475 0.000514 ***
## I(1/dist_dc^2)      -2.852e-04  1.095e-04  -2.605 0.009221 ** 
## monthly_gas_2km      4.023e-05  9.376e-06   4.291 1.81e-05 ***
## elevation           -3.687e-02  4.511e-03  -8.172 3.63e-16 ***
## EVI                 -1.343e+00  6.496e-02 -20.679  < 2e-16 ***
## num_odor_complaints  8.478e-03  1.877e-03   4.517 6.37e-06 ***
## daily_downwind_ref  -5.009e-03  1.620e-02  -0.309 0.757170    
## daily_temp           2.439e-03  1.418e-03   1.720 0.085482 .  
## daily_hum           -9.993e-03  3.923e-04 -25.473  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     6.901  8.000  5.193
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.943  8.997 38.764
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.859 80.000 38.890
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 114/117
## R-sq.(adj) =  0.618   Deviance explained = 62.4%
## GCV = 0.10034  Scale est. = 0.098606  n = 6531

Disaster Only

# Disaster only
summary(da_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + weekday + 
##     wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.242e+01  1.240e+01   2.615 0.009040 ** 
## weekdayMon          -6.532e+00  7.104e+00  -0.920 0.357985    
## weekdayTue          -1.077e+01  7.280e+00  -1.479 0.139444    
## weekdayWed           4.488e-01  7.088e+00   0.063 0.949527    
## weekdayThu          -3.571e-01  7.076e+00  -0.050 0.959766    
## weekdayFri           1.919e+00  6.917e+00   0.277 0.781522    
## weekdaySat           1.194e+00  7.040e+00   0.170 0.865381    
## wd_avg              -6.137e-02  2.086e-02  -2.941 0.003332 ** 
## ws_avg               6.722e+00  1.791e+00   3.753 0.000183 ***
## I(1/dist_wrp^2)     -1.034e-04  5.137e-05  -2.012 0.044470 *  
## I(1/dist_ref^2)      1.197e-03  6.008e-04   1.993 0.046481 *  
## I(1/dist_dc^2)       3.169e-01  1.510e-01   2.098 0.036104 *  
## monthly_oil_2km     -5.058e-03  1.771e-03  -2.857 0.004358 ** 
## inactive_2km         4.246e+00  1.757e+00   2.417 0.015818 *  
## num_odor_complaints -1.072e+00  1.618e-01  -6.624  5.3e-11 ***
## daily_hum           -2.316e-01  1.185e-01  -1.954 0.050880 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.313  8.838 3.189
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 67.903 80.000 8.117
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                    9e-04 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 102/105
## R-sq.(adj) =  0.417   Deviance explained = 45.7%
## GCV = 4760.8  Scale est. = 4427.1    n = 1273

Exclude Disaster

# Exclude disaster
summary(da_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_dc^2) + monthly_oil_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.336e+00  1.149e-01  11.632  < 2e-16 ***
## year2021             1.601e-02  4.026e-02   0.398  0.69090    
## year2022             4.588e-01  6.379e-02   7.193 6.67e-13 ***
## year2023             4.155e-01  6.146e-02   6.759 1.44e-11 ***
## weekdayMon           9.196e-02  1.360e-02   6.764 1.40e-11 ***
## weekdayTue           1.457e-01  1.355e-02  10.757  < 2e-16 ***
## weekdayWed           1.716e-01  1.354e-02  12.671  < 2e-16 ***
## weekdayThu           1.524e-01  1.355e-02  11.253  < 2e-16 ***
## weekdayFri           1.479e-01  1.355e-02  10.916  < 2e-16 ***
## weekdaySat           7.646e-02  1.356e-02   5.639 1.74e-08 ***
## ws_avg              -2.865e-02  1.914e-03 -14.971  < 2e-16 ***
## I(1/dist_wrp^2)      6.345e-07  3.319e-07   1.912  0.05590 .  
## I(1/dist_dc^2)       4.828e-04  2.112e-04   2.286  0.02228 *  
## monthly_oil_2km      4.078e-06  4.286e-06   0.952  0.34136    
## active_2km           1.040e-02  2.562e-03   4.058 4.97e-05 ***
## inactive_2km        -8.128e-03  5.917e-03  -1.374  0.16957    
## elevation           -7.787e-03  2.782e-03  -2.798  0.00514 ** 
## EVI                 -1.565e+00  9.635e-02 -16.245  < 2e-16 ***
## num_odor_complaints  4.772e-03  1.044e-03   4.569 4.94e-06 ***
## daily_downwind_ref  -3.253e-02  1.420e-02  -2.290  0.02203 *  
## daily_temp           2.445e-03  1.226e-03   1.994  0.04613 *  
## daily_hum           -9.020e-03  3.402e-04 -26.510  < 2e-16 ***
## daily_precip        -6.312e-02  2.599e-02  -2.428  0.01518 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.909      8 107.91
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9  42.43
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.630     80  50.40
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/120
## R-sq.(adj) =  0.459   Deviance explained = 46.3%
## GCV = 0.18824  Scale est. = 0.1867    n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(da_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + wd_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.153e+02  1.682e+01  -6.854 7.48e-12 ***
## year2021              1.107e+01  2.264e+00   4.891 1.02e-06 ***
## year2022              1.349e+01  2.925e+00   4.612 4.02e-06 ***
## year2023              9.270e+00  2.703e+00   3.429 0.000606 ***
## wd_avg               -1.067e-02  2.383e-03  -4.476 7.65e-06 ***
## I(1/dist_wrp^2)      -2.361e-05  5.243e-06  -4.503 6.74e-06 ***
## I(1/dist_ref^2)      -7.832e-04  2.288e-04  -3.423 0.000622 ***
## I(1/dist_dc^2)        1.038e-01  1.910e-02   5.436 5.54e-08 ***
## monthly_oil_2km       2.611e-04  2.487e-04   1.050 0.293757    
## active_2km            7.209e-01  1.122e-01   6.427 1.34e-10 ***
## elevation            -6.395e-01  1.300e-01  -4.921 8.71e-07 ***
## EVI                  -3.477e+01  5.784e+00  -6.013 1.87e-09 ***
## num_odor_complaints   6.429e-01  3.073e-02  20.920  < 2e-16 ***
## closest_wrp_capacity  2.489e-01  3.446e-02   7.222 5.38e-13 ***
## daily_downwind_ref   -3.257e+00  7.296e-01  -4.464 8.09e-06 ***
## daily_downwind_wrp    1.003e+00  7.885e-01   1.272 0.203454    
## daily_hum            -6.404e-02  1.501e-02  -4.268 1.99e-05 ***
## disaster              1.334e+01  1.874e+00   7.116 1.16e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.223  8.000 3.920
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.918  8.978 7.176
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 73.982 80.000 8.124
##                                                          p-value    
## s(as.numeric(month))                                    3.17e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 112/115
## R-sq.(adj) =  0.114   Deviance explained =   12%
## GCV = 558.69  Scale est. = 554.93    n = 15595

Everything w.o Disaster Indicator

# Everything
summary(da_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + wd_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -9.133e+01  1.643e+01  -5.560 2.74e-08 ***
## year2021              1.432e+00  1.818e+00   0.788   0.4309    
## year2022              1.369e+00  2.382e+00   0.575   0.5655    
## year2023              1.471e+00  2.473e+00   0.595   0.5521    
## wd_avg               -1.045e-02  2.387e-03  -4.377 1.21e-05 ***
## I(1/dist_wrp^2)      -2.190e-05  5.392e-06  -4.062 4.89e-05 ***
## I(1/dist_ref^2)      -5.814e-04  2.290e-04  -2.539   0.0111 *  
## I(1/dist_dc^2)        8.553e-02  1.867e-02   4.582 4.63e-06 ***
## monthly_oil_2km       3.076e-04  2.482e-04   1.239   0.2154    
## active_2km            6.364e-01  1.117e-01   5.699 1.23e-08 ***
## elevation            -6.377e-01  1.301e-01  -4.901 9.63e-07 ***
## EVI                  -3.090e+01  5.760e+00  -5.364 8.26e-08 ***
## num_odor_complaints   6.680e-01  3.059e-02  21.836  < 2e-16 ***
## closest_wrp_capacity  2.364e-01  3.428e-02   6.896 5.56e-12 ***
## daily_downwind_ref   -3.341e+00  7.307e-01  -4.572 4.86e-06 ***
## daily_downwind_wrp    9.727e-01  7.897e-01   1.232   0.2181    
## daily_hum            -6.601e-02  1.502e-02  -4.394 1.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.189  8.000 4.364
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.913  8.977 6.952
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 73.797 80.000 7.824
##                                                          p-value    
## s(as.numeric(month))                                    3.41e-06 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =  0.111   Deviance explained = 11.7%
## GCV = 560.48  Scale est. = 556.74    n = 15595

Log Daily Average

Since February 2022

# Since feb 2022
summary(log_da_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + inactive_2km + 
##     elevation + EVI + num_odor_complaints + daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.432e+00  1.981e-01   7.232 5.30e-13 ***
## year2023            -8.379e-02  5.080e-02  -1.649 0.099161 .  
## weekdayMon           1.229e-01  1.816e-02   6.767 1.43e-11 ***
## weekdayTue           2.075e-01  1.813e-02  11.445  < 2e-16 ***
## weekdayWed           2.046e-01  1.817e-02  11.258  < 2e-16 ***
## weekdayThu           1.491e-01  1.819e-02   8.196 2.98e-16 ***
## weekdayFri           1.690e-01  1.814e-02   9.316  < 2e-16 ***
## weekdaySat           1.111e-01  1.813e-02   6.129 9.36e-10 ***
## wd_avg               3.863e-04  6.599e-05   5.853 5.05e-09 ***
## ws_avg              -9.982e-02  3.432e-03 -29.085  < 2e-16 ***
## I(1/dist_wrp^2)      2.520e-07  4.049e-07   0.622 0.533743    
## I(1/dist_ref^2)     -2.302e-07  4.747e-06  -0.048 0.961325    
## I(1/dist_dc^2)      -2.180e-04  6.788e-05  -3.212 0.001326 ** 
## inactive_2km         1.669e-02  4.984e-03   3.350 0.000814 ***
## elevation           -3.943e-02  7.751e-03  -5.087 3.74e-07 ***
## EVI                 -1.905e+00  1.451e-01 -13.124  < 2e-16 ***
## num_odor_complaints  1.194e-02  2.328e-03   5.130 2.99e-07 ***
## daily_temp           2.513e-03  1.762e-03   1.426 0.153804    
## daily_hum           -1.297e-02  4.847e-04 -26.769  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.093   8.00  3.389
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   7.595   8.23 53.608
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.875  80.00 63.580
##                                                          p-value    
## s(as.numeric(month))                                    0.000191 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 113/116
## R-sq.(adj) =  0.698   Deviance explained = 70.3%
## GCV = 0.15447  Scale est. = 0.15186   n = 6531
get_gam_result <- function(model_obj, model_name){
  p <- summary(model_obj)$p.pv
  coef <- tibble(Term = names(summary(model_obj)$p.coeff), x = summary(model_obj)$p.coeff) %>%
              mutate(x = paste0(formatC(x, format = 'e', digits = 2), 
                              case_when(p < 0.001 ~ '***', 
                                        0.001 <= p & p < 0.01 ~ '**', 
                                        0.01 <= p & p < 0.05 ~ '*', 
                                        0.05 <= p & p < 0.1 ~ '.', 
                                        TRUE ~ '')))  %>%
              rename({{model_name}} := x)
  coef <- rbind(coef, tibble(Term = 'Adj.R-sq', {{model_name}} := 
                             formatC(summary(model_obj)$r.sq, format = 'e', 
                             digits = 2)))
  return(coef)
}
sincefeb2022_model_stat <- get_gam_result(log_da_sincefeb2022_gam, 'Daily Average H2S Since Feb 2022') 

sincefeb2022_model_stat_latex <- knitr::kable(sincefeb2022_model_stat, format = 'latex', digits = 2)
writeLines(sincefeb2022_model_stat_latex, '../figures/sincefeb2022_log_model.tex')

knitr::kable(sincefeb2022_model_stat, format = 'html', digits = 2)
Term Daily Average H2S Since Feb 2022
(Intercept) 1.43e+00***
year2023 -8.38e-02.
weekdayMon 1.23e-01***
weekdayTue 2.08e-01***
weekdayWed 2.05e-01***
weekdayThu 1.49e-01***
weekdayFri 1.69e-01***
weekdaySat 1.11e-01***
wd_avg 3.86e-04***
ws_avg -9.98e-02***
I(1/dist_wrp^2) 2.52e-07
I(1/dist_ref^2) -2.30e-07
I(1/dist_dc^2) -2.18e-04**
inactive_2km 1.67e-02***
elevation -3.94e-02***
EVI -1.90e+00***
num_odor_complaints 1.19e-02***
daily_temp 2.51e-03
daily_hum -1.30e-02***
Adj.R-sq 6.98e-01

Disaster Only

# Disaster only
summary(log_da_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_temp + daily_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.399e+01  5.713e+00  -2.449 0.014450 *  
## month11               5.086e-01  1.310e-01   3.882 0.000109 ***
## month12               4.853e-01  1.452e-01   3.343 0.000854 ***
## weekdayMon            4.504e-02  6.325e-02   0.712 0.476553    
## weekdayTue           -1.303e-01  6.478e-02  -2.012 0.044445 *  
## weekdayWed            1.073e-01  6.281e-02   1.708 0.087868 .  
## weekdayThu            7.894e-02  6.303e-02   1.252 0.210689    
## weekdayFri           -9.842e-02  6.144e-02  -1.602 0.109460    
## weekdaySat           -1.387e-01  6.204e-02  -2.236 0.025544 *  
## ws_avg               -1.428e-01  1.628e-02  -8.766  < 2e-16 ***
## I(1/dist_wrp^2)      -1.877e-05  7.598e-06  -2.470 0.013655 *  
## I(1/dist_ref^2)      -1.698e-04  7.029e-05  -2.415 0.015875 *  
## I(1/dist_dc^2)        1.556e-01  6.090e-02   2.556 0.010722 *  
## monthly_oil_2km       3.410e-04  1.225e-04   2.783 0.005466 ** 
## active_2km           -4.484e-01  4.093e-02 -10.956  < 2e-16 ***
## inactive_2km          9.736e-01  1.340e-01   7.263 6.83e-13 ***
## elevation            -2.777e-01  3.584e-02  -7.748 1.99e-14 ***
## num_odor_complaints   8.276e-03  1.340e-03   6.177 8.98e-10 ***
## closest_wrp_capacity  4.577e-02  1.095e-02   4.182 3.11e-05 ***
## daily_downwind_ref   -8.101e-02  5.618e-02  -1.442 0.149606    
## daily_temp            9.549e-03  6.866e-03   1.391 0.164564    
## daily_hum            -8.024e-03  1.338e-03  -5.997 2.67e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 25.693
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 58.55     80  9.163
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 108/111
## R-sq.(adj) =  0.746   Deviance explained = 76.3%
## GCV = 0.36938  Scale est. = 0.34427   n = 1273

Exclude Disaster

# Exclude disaster
summary(log_da_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -3.777e-04  1.276e-01  -0.003 0.997638    
## year2021             5.695e-02  4.471e-02   1.274 0.202721    
## year2022             2.862e-01  7.084e-02   4.040 5.36e-05 ***
## year2023             3.888e-01  6.828e-02   5.694 1.27e-08 ***
## weekdayMon           1.206e-01  1.509e-02   7.990 1.46e-15 ***
## weekdayTue           1.832e-01  1.504e-02  12.182  < 2e-16 ***
## weekdayWed           2.088e-01  1.503e-02  13.889  < 2e-16 ***
## weekdayThu           1.913e-01  1.504e-02  12.717  < 2e-16 ***
## weekdayFri           1.901e-01  1.505e-02  12.636  < 2e-16 ***
## weekdaySat           1.047e-01  1.505e-02   6.953 3.73e-12 ***
## ws_avg              -3.991e-02  2.125e-03 -18.782  < 2e-16 ***
## I(1/dist_wrp^2)      1.817e-06  3.693e-07   4.918 8.83e-07 ***
## I(1/dist_ref^2)     -7.611e-05  1.670e-05  -4.558 5.20e-06 ***
## I(1/dist_dc^2)      -3.279e-04  3.098e-04  -1.058 0.289931    
## monthly_oil_2km     -2.430e-06  4.769e-06  -0.510 0.610338    
## active_2km           2.162e-02  2.857e-03   7.565 4.11e-14 ***
## inactive_2km        -9.643e-03  6.648e-03  -1.451 0.146930    
## elevation           -2.695e-03  3.091e-03  -0.872 0.383265    
## EVI                 -2.515e+00  1.072e-01 -23.470  < 2e-16 ***
## num_odor_complaints  4.427e-03  1.160e-03   3.818 0.000135 ***
## daily_downwind_ref   5.065e-03  1.577e-02   0.321 0.748096    
## daily_temp           5.239e-03  1.361e-03   3.850 0.000119 ***
## daily_hum           -1.109e-02  3.777e-04 -29.360  < 2e-16 ***
## daily_precip        -1.224e-01  2.886e-02  -4.242 2.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.904      8 102.29
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 104.05
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.878     80  86.72
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/121
## R-sq.(adj) =  0.567   Deviance explained =   57%
## GCV = 0.23204  Scale est. = 0.23013   n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(log_da_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_dc^2) + 
##     monthly_oil_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + daily_downwind_ref + daily_temp + 
##     daily_hum + daily_precip + disaster
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -3.736e-01  1.489e-01  -2.509 0.012120 *  
## year2021             9.739e-01  5.353e-02  18.193  < 2e-16 ***
## year2022             1.040e+00  6.966e-02  14.930  < 2e-16 ***
## year2023             9.021e-01  6.496e-02  13.888  < 2e-16 ***
## weekdayMon           1.070e-01  1.628e-02   6.575 5.01e-11 ***
## weekdayTue           1.477e-01  1.623e-02   9.100  < 2e-16 ***
## weekdayWed           1.957e-01  1.621e-02  12.073  < 2e-16 ***
## weekdayThu           1.767e-01  1.623e-02  10.886  < 2e-16 ***
## weekdayFri           1.573e-01  1.620e-02   9.711  < 2e-16 ***
## weekdaySat           7.822e-02  1.623e-02   4.819 1.46e-06 ***
## ws_avg              -4.139e-02  2.349e-03 -17.619  < 2e-16 ***
## I(1/dist_dc^2)       1.592e-03  2.785e-04   5.716 1.11e-08 ***
## monthly_oil_2km     -1.465e-05  4.900e-06  -2.990 0.002798 ** 
## active_2km           2.956e-02  3.023e-03   9.778  < 2e-16 ***
## inactive_2km        -2.037e-02  6.992e-03  -2.913 0.003587 ** 
## elevation           -1.096e-02  3.387e-03  -3.237 0.001213 ** 
## EVI                 -2.610e+00  1.135e-01 -22.999  < 2e-16 ***
## num_odor_complaints  1.765e-02  7.066e-04  24.978  < 2e-16 ***
## daily_downwind_ref  -5.012e-02  1.667e-02  -3.007 0.002644 ** 
## daily_temp          -5.042e-03  1.411e-03  -3.574 0.000353 ***
## daily_hum           -1.197e-02  3.847e-04 -31.115  < 2e-16 ***
## daily_precip        -1.284e-01  2.729e-02  -4.704 2.57e-06 ***
## disaster             1.029e+00  4.399e-02  23.382  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.865   8.00 48.91
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.854   8.96 59.43
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.427  80.00 74.35
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 119/120
## R-sq.(adj) =  0.559   Deviance explained = 56.2%
## GCV = 0.2936  Scale est. = 0.29138   n = 15595

Everything w.o Disaster Indicator

# Everything
summary(log_da_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_dc^2) + 
##     monthly_oil_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + daily_downwind_ref + daily_temp + 
##     daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.093e+00  1.372e-01   7.965 1.77e-15 ***
## year2021             2.027e-01  4.294e-02   4.720 2.38e-06 ***
## year2022             6.589e-02  5.685e-02   1.159  0.24653    
## year2023             2.483e-01  5.967e-02   4.161 3.19e-05 ***
## weekdayMon           1.069e-01  1.656e-02   6.455 1.12e-10 ***
## weekdayTue           1.486e-01  1.651e-02   8.997  < 2e-16 ***
## weekdayWed           1.969e-01  1.650e-02  11.933  < 2e-16 ***
## weekdayThu           1.764e-01  1.651e-02  10.684  < 2e-16 ***
## weekdayFri           1.591e-01  1.649e-02   9.650  < 2e-16 ***
## weekdaySat           8.031e-02  1.652e-02   4.862 1.17e-06 ***
## ws_avg              -4.308e-02  2.389e-03 -18.032  < 2e-16 ***
## I(1/dist_dc^2)       1.503e-03  2.365e-04   6.355 2.15e-10 ***
## monthly_oil_2km     -1.173e-05  4.979e-06  -2.355  0.01855 *  
## active_2km           1.887e-02  3.033e-03   6.220 5.08e-10 ***
## inactive_2km        -7.388e-04  7.016e-03  -0.105  0.91613    
## elevation           -1.597e-02  3.432e-03  -4.653 3.31e-06 ***
## EVI                 -2.432e+00  1.149e-01 -21.159  < 2e-16 ***
## num_odor_complaints  1.953e-02  7.144e-04  27.338  < 2e-16 ***
## daily_downwind_ref  -5.271e-02  1.696e-02  -3.108  0.00189 ** 
## daily_temp          -3.894e-03  1.434e-03  -2.715  0.00663 ** 
## daily_hum           -1.193e-02  3.913e-04 -30.489  < 2e-16 ***
## daily_precip        -1.368e-01  2.777e-02  -4.925 8.53e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.765   8.00 39.74
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.653   8.89 48.04
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.455  80.00 72.57
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/119
## R-sq.(adj) =  0.543   Deviance explained = 54.7%
## GCV = 0.30403  Scale est. = 0.30175   n = 15595

Daily Max

Since February 2022

# Since feb 2022
summary(dm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + monthly_oil_2km + active_2km + 
##     num_odor_complaints + daily_downwind_wrp + daily_temp + daily_hum + 
##     daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.926e+00  9.735e-01   3.005 0.002662 ** 
## year2023            -1.214e+00  2.950e-01  -4.115 3.92e-05 ***
## weekdayMon           5.560e-01  1.546e-01   3.597 0.000324 ***
## weekdayTue           9.163e-01  1.544e-01   5.936 3.07e-09 ***
## weekdayWed           8.815e-01  1.545e-01   5.706 1.21e-08 ***
## weekdayThu           7.475e-01  1.546e-01   4.834 1.37e-06 ***
## weekdayFri           8.450e-01  1.542e-01   5.479 4.44e-08 ***
## weekdaySat           3.239e-01  1.543e-01   2.099 0.035896 *  
## wd_avg               2.251e-03  5.606e-04   4.016 6.00e-05 ***
## ws_avg              -2.553e-01  3.107e-02  -8.217 2.50e-16 ***
## I(1/dist_wrp^2)     -1.883e-07  1.719e-05  -0.011 0.991260    
## I(1/dist_ref^2)      3.867e-03  7.315e-04   5.286 1.29e-07 ***
## monthly_oil_2km      1.883e-04  4.105e-05   4.587 4.57e-06 ***
## active_2km          -7.997e-02  1.417e-02  -5.642 1.75e-08 ***
## num_odor_complaints  1.415e-01  1.975e-02   7.165 8.62e-13 ***
## daily_downwind_wrp   4.322e-01  1.856e-01   2.328 0.019931 *  
## daily_temp           4.145e-02  1.360e-02   3.047 0.002318 ** 
## daily_hum           -3.255e-02  3.887e-03  -8.373  < 2e-16 ***
## daily_precip         6.498e-01  2.383e-01   2.727 0.006416 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                              edf Ref.df     F
## s(as.numeric(month))                                     0.04726  8.000 0.006
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.87275  8.977 8.416
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 77.91134 80.000 3.386
##                                                         p-value    
## s(as.numeric(month))                                      0.363    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 114/116
## R-sq.(adj) =  0.219   Deviance explained = 23.1%
## GCV = 11.187  Scale est. = 11.009    n = 6531

Disaster Only

# Disaster only
summary(dm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.849e+02  9.013e+01   3.161  0.00161 ** 
## wd_avg              -5.063e-01  1.611e-01  -3.143  0.00172 ** 
## ws_avg               4.253e+01  1.327e+01   3.206  0.00138 ** 
## I(1/dist_wrp^2)     -9.204e-04  4.076e-04  -2.258  0.02413 *  
## I(1/dist_ref^2)      1.050e-02  4.707e-03   2.231  0.02589 *  
## I(1/dist_dc^2)       2.830e+00  1.205e+00   2.350  0.01895 *  
## monthly_oil_2km     -3.934e-02  1.377e-02  -2.858  0.00434 ** 
## inactive_2km         3.257e+01  1.364e+01   2.388  0.01708 *  
## num_odor_complaints -8.312e+00  1.257e+00  -6.614 5.65e-11 ***
## daily_hum           -1.896e+00  8.990e-01  -2.109  0.03519 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.359  8.855 3.374
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.202 80.000 8.513
##                                                          p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  0.000459 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 96/99
## R-sq.(adj) =  0.432   Deviance explained = 46.9%
## GCV = 2.8585e+05  Scale est. = 2.6709e+05  n = 1273

Exclude Disaster

# Exclude disaster
summary(dm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + EVI + num_odor_complaints + 
##     daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.003e+00  9.042e-01   3.321 0.000899 ***
## year2021            -2.472e-01  3.861e-01  -0.640 0.522146    
## year2022             3.457e-01  5.820e-01   0.594 0.552551    
## year2023             1.041e-01  5.522e-01   0.188 0.850526    
## weekdayMon           5.609e-01  1.318e-01   4.255 2.10e-05 ***
## weekdayTue           9.562e-01  1.312e-01   7.286 3.37e-13 ***
## weekdayWed           8.571e-01  1.312e-01   6.533 6.68e-11 ***
## weekdayThu           7.787e-01  1.313e-01   5.932 3.05e-09 ***
## weekdayFri           7.438e-01  1.313e-01   5.665 1.50e-08 ***
## weekdaySat           3.121e-01  1.314e-01   2.375 0.017559 *  
## wd_avg               1.151e-03  4.524e-04   2.545 0.010934 *  
## ws_avg              -1.008e-01  1.799e-02  -5.602 2.15e-08 ***
## I(1/dist_wrp^2)      3.888e-06  6.621e-07   5.873 4.38e-09 ***
## I(1/dist_ref^2)      2.305e-05  2.477e-06   9.306  < 2e-16 ***
## EVI                 -3.008e+00  3.765e-01  -7.988 1.48e-15 ***
## num_odor_complaints  7.779e-02  1.000e-02   7.779 7.82e-15 ***
## daily_temp           2.687e-02  1.176e-02   2.285 0.022318 *  
## daily_hum           -3.101e-02  3.231e-03  -9.599  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     6.744      8  5.513
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 13.385
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 61.906     80  4.086
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 113/115
## R-sq.(adj) =  0.123   Deviance explained = 12.9%
## GCV = 17.651  Scale est. = 17.536    n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(dm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + wd_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -9.256e+02  1.319e+02  -7.018 2.34e-12 ***
## year2021              8.862e+01  1.774e+01   4.995 5.96e-07 ***
## year2022              1.069e+02  2.292e+01   4.667 3.09e-06 ***
## year2023              7.281e+01  2.119e+01   3.437 0.000590 ***
## wd_avg               -8.471e-02  1.865e-02  -4.542 5.62e-06 ***
## I(1/dist_wrp^2)      -1.955e-04  4.146e-05  -4.715 2.44e-06 ***
## I(1/dist_ref^2)      -5.882e-03  1.793e-03  -3.280 0.001041 ** 
## I(1/dist_dc^2)        8.348e-01  1.540e-01   5.421 6.00e-08 ***
## monthly_oil_2km       2.020e-03  1.952e-03   1.035 0.300864    
## active_2km            5.716e+00  8.784e-01   6.508 7.88e-11 ***
## elevation            -5.116e+00  1.018e+00  -5.027 5.04e-07 ***
## EVI                  -2.694e+02  4.528e+01  -5.949 2.75e-09 ***
## num_odor_complaints   5.403e+00  2.405e-01  22.461  < 2e-16 ***
## closest_wrp_capacity  1.979e+00  2.702e-01   7.323 2.55e-13 ***
## daily_downwind_ref   -2.587e+01  5.710e+00  -4.531 5.91e-06 ***
## daily_downwind_wrp    8.655e+00  6.171e+00   1.403 0.160756    
## daily_hum            -4.561e-01  1.175e-01  -3.883 0.000104 ***
## disaster              1.081e+02  1.469e+01   7.359 1.95e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.304  8.000 4.641
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.912  8.976 7.470
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.237 80.000 8.796
##                                                          p-value    
## s(as.numeric(month))                                    1.27e-06 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 112/115
## R-sq.(adj) =  0.124   Deviance explained = 12.9%
## GCV =  34218  Scale est. = 33986     n = 15595

Everything w.o Disaster Indicator

# Everything
summary(dm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -7.412e+02  1.290e+02  -5.746 9.32e-09 ***
## year2021              1.055e+01  1.424e+01   0.741   0.4586    
## year2022              9.823e+00  1.867e+01   0.526   0.5989    
## year2023              8.893e+00  1.938e+01   0.459   0.6464    
## wd_avg               -8.519e-02  1.873e-02  -4.547 5.48e-06 ***
## ws_avg                1.243e+00  7.826e-01   1.588   0.1123    
## I(1/dist_wrp^2)      -1.791e-04  4.252e-05  -4.213 2.54e-05 ***
## I(1/dist_ref^2)      -4.456e-03  1.788e-03  -2.492   0.0127 *  
## I(1/dist_dc^2)        6.850e-01  1.514e-01   4.526 6.07e-06 ***
## monthly_oil_2km       2.394e-03  1.949e-03   1.229   0.2193    
## active_2km            5.035e+00  8.745e-01   5.757 8.72e-09 ***
## elevation            -5.010e+00  1.021e+00  -4.908 9.31e-07 ***
## EVI                  -2.347e+02  4.516e+01  -5.198 2.04e-07 ***
## num_odor_complaints   5.605e+00  2.395e-01  23.407  < 2e-16 ***
## closest_wrp_capacity  1.881e+00  2.690e-01   6.993 2.81e-12 ***
## daily_downwind_ref   -2.669e+01  5.720e+00  -4.666 3.10e-06 ***
## daily_downwind_wrp    8.224e+00  6.182e+00   1.330   0.1834    
## daily_hum            -4.651e-01  1.177e-01  -3.951 7.82e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.306  8.000 5.365
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.910  8.976 7.353
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.029 80.000 8.445
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 112/115
## R-sq.(adj) =  0.121   Deviance explained = 12.6%
## GCV =  34333  Scale est. = 34102     n = 15595

Log Daily Max

Since February 2022

# Since feb 2022
summary(log_dm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km + elevation + 
##     EVI + num_odor_complaints + daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.492e+00  2.501e-01   9.961  < 2e-16 ***
## year2023            -2.048e-01  5.484e-02  -3.735 0.000189 ***
## weekdayMon           2.610e-01  2.900e-02   8.998  < 2e-16 ***
## weekdayTue           3.504e-01  2.896e-02  12.100  < 2e-16 ***
## weekdayWed           3.293e-01  2.899e-02  11.357  < 2e-16 ***
## weekdayThu           2.486e-01  2.901e-02   8.569  < 2e-16 ***
## weekdayFri           2.603e-01  2.893e-02   8.996  < 2e-16 ***
## weekdaySat           1.713e-01  2.896e-02   5.915 3.48e-09 ***
## wd_avg               3.861e-04  1.047e-04   3.688 0.000228 ***
## ws_avg              -1.254e-01  5.460e-03 -22.974  < 2e-16 ***
## I(1/dist_ref^2)      4.423e-05  8.187e-06   5.402 6.82e-08 ***
## I(1/dist_dc^2)      -1.144e-04  1.826e-04  -0.626 0.531126    
## monthly_gas_2km      2.578e-05  1.841e-05   1.400 0.161627    
## elevation           -5.437e-02  8.928e-03  -6.090 1.20e-09 ***
## EVI                 -1.393e+00  1.273e-01 -10.943  < 2e-16 ***
## num_odor_complaints  2.723e-02  3.708e-03   7.344 2.33e-13 ***
## daily_temp           8.807e-03  2.523e-03   3.490 0.000486 ***
## daily_hum           -1.382e-02  7.107e-04 -19.451  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    3.450e-06      8  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  9.000e+00      9 25.34
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.904e+01     80 18.59
##                                                         p-value    
## s(as.numeric(month))                                      0.707    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 113/115
## R-sq.(adj) =  0.543   Deviance explained = 55.1%
## GCV = 0.39402  Scale est. = 0.38774   n = 6531

Disaster Only

# Disaster only
summary(log_dm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + 
##     inactive_2km + elevation + num_odor_complaints + daily_temp + 
##     daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.788e+00  8.089e-01   8.391  < 2e-16 ***
## month11             -1.956e-01  1.658e-01  -1.180 0.238194    
## month12             -4.188e-01  1.844e-01  -2.271 0.023313 *  
## weekdayMon           1.455e-01  8.201e-02   1.775 0.076195 .  
## weekdayTue          -8.291e-02  8.401e-02  -0.987 0.323859    
## weekdayWed           4.878e-02  8.157e-02   0.598 0.549949    
## weekdayThu           1.501e-01  8.180e-02   1.835 0.066792 .  
## weekdayFri          -2.089e-01  7.977e-02  -2.619 0.008934 ** 
## weekdaySat          -1.335e-01  8.061e-02  -1.657 0.097819 .  
## wd_avg               2.471e-04  2.384e-04   1.037 0.300113    
## ws_avg              -1.441e-01  2.116e-02  -6.809 1.56e-11 ***
## I(1/dist_wrp^2)     -3.130e-06  7.624e-07  -4.105 4.31e-05 ***
## I(1/dist_ref^2)      5.212e-05  7.485e-06   6.964 5.48e-12 ***
## I(1/dist_dc^2)       1.662e-02  2.265e-03   7.340 3.96e-13 ***
## monthly_oil_2km      4.527e-04  6.189e-05   7.314 4.74e-13 ***
## monthly_gas_2km     -2.848e-03  3.507e-04  -8.119 1.17e-15 ***
## inactive_2km         8.977e-02  2.344e-02   3.830 0.000135 ***
## elevation           -2.525e-01  2.570e-02  -9.826  < 2e-16 ***
## num_odor_complaints  8.459e-03  1.720e-03   4.919 9.90e-07 ***
## daily_temp           7.311e-03  8.888e-03   0.823 0.410953    
## daily_hum           -6.435e-03  1.733e-03  -3.712 0.000215 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.938  8.991 15.488
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 56.931 80.000  7.708
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 107/110
## R-sq.(adj) =  0.725   Deviance explained = 74.3%
## GCV =  0.621  Scale est. = 0.58009   n = 1273

Exclude Disaster

# Exclude disaster
summary(log_dm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_ref^2) + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + daily_downwind_wrp + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.085e+00  1.750e-01   6.201 5.76e-10 ***
## year2021            -3.869e-02  6.140e-02  -0.630  0.52867    
## year2022            -1.362e-01  9.715e-02  -1.402  0.16104    
## year2023            -1.666e-02  9.364e-02  -0.178  0.85882    
## weekdayMon           2.610e-01  2.076e-02  12.570  < 2e-16 ***
## weekdayTue           3.067e-01  2.068e-02  14.835  < 2e-16 ***
## weekdayWed           3.350e-01  2.067e-02  16.210  < 2e-16 ***
## weekdayThu           2.968e-01  2.068e-02  14.351  < 2e-16 ***
## weekdayFri           2.901e-01  2.068e-02  14.024  < 2e-16 ***
## weekdaySat           1.532e-01  2.069e-02   7.401 1.43e-13 ***
## wd_avg               1.833e-04  7.184e-05   2.551  0.01075 *  
## ws_avg              -5.395e-02  2.933e-03 -18.392  < 2e-16 ***
## I(1/dist_ref^2)      1.829e-05  1.234e-05   1.482  0.13843    
## monthly_gas_2km      4.008e-06  1.524e-05   0.263  0.79261    
## active_2km           2.131e-02  3.587e-03   5.943 2.87e-09 ***
## inactive_2km        -2.208e-02  8.378e-03  -2.635  0.00842 ** 
## elevation           -2.023e-02  4.118e-03  -4.912 9.13e-07 ***
## EVI                 -2.105e+00  1.230e-01 -17.113  < 2e-16 ***
## num_odor_complaints  1.221e-02  1.592e-03   7.667 1.87e-14 ***
## daily_downwind_wrp   5.050e-02  2.300e-02   2.196  0.02809 *  
## daily_temp           1.384e-02  1.876e-03   7.377 1.71e-13 ***
## daily_hum           -1.262e-02  5.214e-04 -24.200  < 2e-16 ***
## daily_precip        -1.039e-01  3.974e-02  -2.616  0.00891 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.695      8 44.12
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 34.33
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.037     80 38.84
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 119/120
## R-sq.(adj) =   0.48   Deviance explained = 48.4%
## GCV = 0.43856  Scale est. = 0.43496   n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(log_dm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + elevation + EVI + num_odor_complaints + 
##     closest_wrp_capacity + daily_downwind_ref + daily_temp + 
##     daily_hum + daily_precip + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.046e+00  2.634e-01   7.768 8.45e-15 ***
## year2021              1.117e+00  6.996e-02  15.972  < 2e-16 ***
## year2022              1.134e+00  9.109e-02  12.447  < 2e-16 ***
## year2023              8.490e-01  8.407e-02  10.098  < 2e-16 ***
## weekdayMon            2.415e-01  2.160e-02  11.180  < 2e-16 ***
## weekdayTue            2.645e-01  2.153e-02  12.285  < 2e-16 ***
## weekdayWed            3.078e-01  2.152e-02  14.305  < 2e-16 ***
## weekdayThu            2.760e-01  2.154e-02  12.817  < 2e-16 ***
## weekdayFri            2.352e-01  2.150e-02  10.939  < 2e-16 ***
## weekdaySat            1.184e-01  2.154e-02   5.495 3.98e-08 ***
## ws_avg               -5.286e-02  3.113e-03 -16.983  < 2e-16 ***
## I(1/dist_wrp^2)      -4.773e-01  2.916e-02 -16.369  < 2e-16 ***
## I(1/dist_ref^2)       3.885e-04  2.803e-05  13.858  < 2e-16 ***
## I(1/dist_dc^2)        4.243e+03  2.592e+02  16.369  < 2e-16 ***
## elevation            -1.904e-02  3.851e-03  -4.943 7.76e-07 ***
## EVI                  -1.362e+00  7.380e-02 -18.452  < 2e-16 ***
## num_odor_complaints   2.397e-02  9.373e-04  25.574  < 2e-16 ***
## closest_wrp_capacity -3.557e-03  3.529e-04 -10.078  < 2e-16 ***
## daily_downwind_ref    1.444e-02  2.221e-02   0.650   0.5157    
## daily_temp           -1.663e-03  1.870e-03  -0.889   0.3740    
## daily_hum            -1.380e-02  5.102e-04 -27.052  < 2e-16 ***
## daily_precip         -9.858e-02  3.622e-02  -2.722   0.0065 ** 
## disaster              1.340e+00  5.767e-02  23.232  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.837  8.000 47.24
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.732  8.937 39.83
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.984 80.000 32.38
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/120
## R-sq.(adj) =  0.508   Deviance explained = 51.2%
## GCV = 0.51707  Scale est. = 0.5132    n = 15595

Everything w.o Disaster Indicator

# Everything
summary(log_dm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.800e+00  2.568e-01  14.797  < 2e-16 ***
## year2021              1.321e-01  5.664e-02   2.332  0.01971 *  
## year2022             -1.082e-01  7.504e-02  -1.442  0.14929    
## year2023              4.617e-02  7.793e-02   0.592  0.55354    
## weekdayMon            2.420e-01  2.199e-02  11.008  < 2e-16 ***
## weekdayTue            2.658e-01  2.191e-02  12.133  < 2e-16 ***
## weekdayWed            3.094e-01  2.189e-02  14.135  < 2e-16 ***
## weekdayThu            2.757e-01  2.191e-02  12.584  < 2e-16 ***
## weekdayFri            2.378e-01  2.187e-02  10.871  < 2e-16 ***
## weekdaySat            1.210e-01  2.192e-02   5.522 3.40e-08 ***
## wd_avg                9.051e-05  7.467e-05   1.212  0.22553    
## ws_avg               -5.582e-02  3.177e-03 -17.569  < 2e-16 ***
## I(1/dist_wrp^2)      -4.951e-01  2.969e-02 -16.679  < 2e-16 ***
## I(1/dist_ref^2)      -5.392e-02  3.235e-03 -16.668  < 2e-16 ***
## I(1/dist_dc^2)        4.393e+03  2.634e+02  16.679  < 2e-16 ***
## elevation            -2.286e-02  3.899e-03  -5.863 4.63e-09 ***
## EVI                  -1.384e+00  7.443e-02 -18.595  < 2e-16 ***
## num_odor_complaints   2.642e-02  9.473e-04  27.893  < 2e-16 ***
## closest_wrp_capacity -3.463e-03  3.563e-04  -9.720  < 2e-16 ***
## daily_downwind_ref    9.056e-03  2.261e-02   0.400  0.68880    
## daily_temp           -2.244e-04  1.908e-03  -0.118  0.90640    
## daily_hum            -1.376e-02  5.219e-04 -26.365  < 2e-16 ***
## daily_precip         -1.042e-01  3.700e-02  -2.815  0.00488 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df     F
## s(as.numeric(month))                                     7.79  8.000 34.03
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.52  8.848 32.60
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.68 80.000 31.48
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/120
## R-sq.(adj) =  0.491   Deviance explained = 49.4%
## GCV = 0.53522  Scale est. = 0.53124   n = 15595

Hourly Avg

Since February 2022

# Since feb 2022
summary(ha_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.725e+00  1.545e-01  37.063  < 2e-16 ***
## year2023             -1.359e-01  1.555e-02  -8.744  < 2e-16 ***
## weekdayMon            7.473e-02  5.572e-03  13.412  < 2e-16 ***
## weekdayTue            1.400e-01  5.523e-03  25.345  < 2e-16 ***
## weekdayWed            1.484e-01  5.536e-03  26.810  < 2e-16 ***
## weekdayThu            1.079e-01  5.540e-03  19.477  < 2e-16 ***
## weekdayFri            1.246e-01  5.519e-03  22.581  < 2e-16 ***
## weekdaySat            6.291e-02  5.504e-03  11.431  < 2e-16 ***
## wd_avg               -2.020e-04  1.771e-05 -11.406  < 2e-16 ***
## ws_avg               -4.966e-02  6.923e-04 -71.727  < 2e-16 ***
## I(1/dist_wrp^2)       1.073e-06  6.572e-08  16.328  < 2e-16 ***
## I(1/dist_ref^2)       9.215e-06  7.250e-07  12.709  < 2e-16 ***
## I(1/dist_dc^2)       -4.244e-04  1.158e-05 -36.634  < 2e-16 ***
## monthly_oil_2km       7.496e-06  2.163e-06   3.465  0.00053 ***
## monthly_gas_2km      -3.754e-05  6.336e-06  -5.925 3.14e-09 ***
## active_2km            1.434e-02  1.733e-03   8.273  < 2e-16 ***
## inactive_2km         -3.426e-02  4.717e-03  -7.264 3.78e-13 ***
## elevation            -3.074e-02  3.480e-03  -8.835  < 2e-16 ***
## EVI                  -1.491e+00  7.683e-02 -19.410  < 2e-16 ***
## num_odor_complaints   1.309e-02  7.212e-04  18.150  < 2e-16 ***
## closest_wrp_capacity -2.882e-03  4.003e-04  -7.201 6.03e-13 ***
## hourly_downwind_ref  -2.759e-02  5.412e-03  -5.097 3.46e-07 ***
## hourly_downwind_wrp   1.955e-02  6.256e-03   3.125  0.00178 ** 
## hourly_temp          -3.226e-02  3.315e-04 -97.325  < 2e-16 ***
## hourly_hum           -9.507e-03  1.176e-04 -80.843  < 2e-16 ***
## hourly_precip         4.669e-01  1.008e-01   4.632 3.62e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.863   8.00 108.1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.260   8.26 184.5
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.990  80.00 289.6
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 119/123
## R-sq.(adj) =  0.351   Deviance explained = 35.2%
## GCV = 0.33385  Scale est. = 0.33359   n = 153718

Disaster Only

# Disaster only
summary(ha_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month + 
##     weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + 
##     I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.439e+02  2.504e+02   1.373 0.169667    
## month11              -5.068e+00  5.033e+00  -1.007 0.313944    
## month12              -1.720e+00  5.452e+00  -0.316 0.752358    
## weekdayMon           -3.528e+00  2.586e+00  -1.364 0.172618    
## weekdayTue           -6.387e+00  2.595e+00  -2.461 0.013857 *  
## weekdayWed           -1.574e+00  2.562e+00  -0.615 0.538825    
## weekdayThu            1.794e+00  2.558e+00   0.701 0.483077    
## weekdayFri            2.918e+00  2.502e+00   1.166 0.243428    
## weekdaySat            2.057e+00  2.531e+00   0.813 0.416351    
## wd_avg               -2.871e-02  6.943e-03  -4.135 3.56e-05 ***
## ws_avg                1.454e+00  3.702e-01   3.929 8.55e-05 ***
## I(1/dist_wrp^2)      -6.655e-04  1.696e-04  -3.925 8.71e-05 ***
## I(1/dist_ref^2)      -1.667e-03  2.781e-03  -0.599 0.548960    
## I(1/dist_dc^2)        5.893e+00  1.665e+00   3.539 0.000402 ***
## monthly_oil_2km      -1.799e-02  8.448e-03  -2.130 0.033219 *  
## monthly_gas_2km      -2.634e-02  2.249e-02  -1.172 0.241372    
## active_2km           -1.138e+01  1.964e+00  -5.795 6.91e-09 ***
## inactive_2km          7.207e+01  6.829e+00  10.554  < 2e-16 ***
## elevation            -3.683e+01  1.824e+00 -20.188  < 2e-16 ***
## EVI                  -6.007e+02  4.246e+01 -14.146  < 2e-16 ***
## num_odor_complaints  -1.253e+00  6.419e-02 -19.512  < 2e-16 ***
## closest_wrp_capacity  1.175e+00  4.648e-01   2.527 0.011497 *  
## hourly_downwind_wrp   7.409e+00  2.649e+00   2.797 0.005164 ** 
## hourly_temp          -8.394e-01  1.383e-01  -6.071 1.29e-09 ***
## hourly_hum           -1.086e-01  4.156e-02  -2.613 0.008980 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.707  8.932 83.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.390 80.000 63.25
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =   0.21   Deviance explained = 21.3%
## GCV =  13802  Scale est. = 13753     n = 30242

Exclude Disaster

# Exclude disaster
summary(ha_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km + 
##     active_2km + inactive_2km + elevation + EVI + num_odor_complaints + 
##     closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           4.693e+00  1.197e-01   39.205  < 2e-16 ***
## year2021              5.288e-02  1.425e-02    3.710 0.000207 ***
## year2022              6.716e-01  2.260e-02   29.721  < 2e-16 ***
## year2023              5.851e-01  2.185e-02   26.781  < 2e-16 ***
## weekdayMon            8.528e-02  4.864e-03   17.533  < 2e-16 ***
## weekdayTue            1.443e-01  4.824e-03   29.916  < 2e-16 ***
## weekdayWed            1.673e-01  4.819e-03   34.710  < 2e-16 ***
## weekdayThu            1.531e-01  4.818e-03   31.776  < 2e-16 ***
## weekdayFri            1.415e-01  4.820e-03   29.355  < 2e-16 ***
## weekdaySat            6.738e-02  4.811e-03   14.006  < 2e-16 ***
## wd_avg               -3.936e-04  1.529e-05  -25.741  < 2e-16 ***
## ws_avg               -3.184e-02  5.121e-04  -62.177  < 2e-16 ***
## I(1/dist_wrp^2)       3.672e-07  1.137e-07    3.230 0.001236 ** 
## I(1/dist_ref^2)       1.079e-06  5.761e-06    0.187 0.851432    
## I(1/dist_dc^2)        7.839e-04  7.907e-05    9.913  < 2e-16 ***
## monthly_gas_2km      -4.583e-05  3.725e-06  -12.303  < 2e-16 ***
## active_2km            1.242e-02  8.628e-04   14.400  < 2e-16 ***
## inactive_2km         -1.488e-02  2.657e-03   -5.602 2.12e-08 ***
## elevation            -8.347e-03  1.114e-03   -7.493 6.74e-14 ***
## EVI                  -1.439e+00  4.685e-02  -30.713  < 2e-16 ***
## num_odor_complaints   5.676e-03  3.750e-04   15.137  < 2e-16 ***
## closest_wrp_capacity -2.247e-03  2.699e-04   -8.326  < 2e-16 ***
## hourly_downwind_ref  -5.162e-02  4.624e-03  -11.164  < 2e-16 ***
## hourly_downwind_wrp   2.443e-02  5.112e-03    4.779 1.76e-06 ***
## hourly_temp          -3.371e-02  2.802e-04 -120.339  < 2e-16 ***
## hourly_hum           -9.323e-03  9.589e-05  -97.234  < 2e-16 ***
## hourly_precip        -4.115e-01  1.143e-01   -3.599 0.000319 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.982      8 589.0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 379.2
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.922     80 404.6
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/124
## R-sq.(adj) =  0.248   Deviance explained = 24.8%
## GCV = 0.55993  Scale est. = 0.55973   n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(ha_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip + 
##     disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.655e+02  6.608e+00 -25.044  < 2e-16 ***
## year2021              9.738e+00  7.510e-01  12.967  < 2e-16 ***
## year2022              1.244e+01  9.733e-01  12.786  < 2e-16 ***
## year2023              7.386e+00  9.119e-01   8.100 5.51e-16 ***
## weekdayMon           -7.005e-01  2.303e-01  -3.041  0.00236 ** 
## weekdayTue           -6.643e-01  2.285e-01  -2.907  0.00365 ** 
## weekdayWed           -3.712e-01  2.282e-01  -1.626  0.10390    
## weekdayThu           -1.339e-01  2.282e-01  -0.587  0.55732    
## weekdayFri            1.326e-01  2.279e-01   0.582  0.56076    
## weekdaySat            6.125e-02  2.278e-01   0.269  0.78807    
## wd_avg               -7.282e-03  7.130e-04 -10.214  < 2e-16 ***
## ws_avg                1.617e-01  2.459e-02   6.575 4.86e-11 ***
## I(1/dist_wrp^2)       4.612e-06  4.095e-06   1.126  0.26007    
## I(1/dist_ref^2)      -3.239e-03  2.064e-04 -15.691  < 2e-16 ***
## I(1/dist_dc^2)        2.534e-01  1.217e-02  20.815  < 2e-16 ***
## monthly_oil_2km       3.964e-04  8.839e-05   4.485 7.30e-06 ***
## monthly_gas_2km       5.471e-05  1.889e-04   0.290  0.77207    
## active_2km            3.530e-01  4.227e-02   8.349  < 2e-16 ***
## inactive_2km          2.244e+00  1.188e-01  18.885  < 2e-16 ***
## elevation            -1.248e+00  5.320e-02 -23.453  < 2e-16 ***
## EVI                  -5.567e+01  2.181e+00 -25.528  < 2e-16 ***
## num_odor_complaints   5.717e-01  1.004e-02  56.940  < 2e-16 ***
## closest_wrp_capacity  4.143e-01  1.451e-02  28.545  < 2e-16 ***
## hourly_downwind_ref  -2.019e+00  2.153e-01  -9.378  < 2e-16 ***
## hourly_downwind_wrp   1.544e+00  2.415e-01   6.394 1.62e-10 ***
## hourly_temp          -2.308e-01  1.278e-02 -18.062  < 2e-16 ***
## hourly_hum           -5.509e-02  4.338e-03 -12.699  < 2e-16 ***
## hourly_precip        -3.925e-01  4.986e+00  -0.079  0.93725    
## disaster              1.251e+01  6.201e-01  20.177  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.933      8 45.69
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 93.50
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.186     80 78.08
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 123/126
## R-sq.(adj) =  0.0486   Deviance explained = 4.89%
## GCV = 1367.9  Scale est. = 1367.5    n = 367838

Everything w.o Disaster Indicator

# Everything
summary(ha_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.493e+02  6.542e+00 -22.826  < 2e-16 ***
## year2021              3.942e-01  5.909e-01   0.667  0.50461    
## year2022              7.027e-01  7.793e-01   0.902  0.36717    
## year2023             -4.345e-01  8.210e-01  -0.529  0.59662    
## weekdayMon           -7.023e-01  2.305e-01  -3.047  0.00231 ** 
## weekdayTue           -6.474e-01  2.286e-01  -2.832  0.00463 ** 
## weekdayWed           -3.577e-01  2.284e-01  -1.566  0.11723    
## weekdayThu           -1.384e-01  2.283e-01  -0.606  0.54425    
## weekdayFri            1.508e-01  2.281e-01   0.661  0.50841    
## weekdaySat            8.398e-02  2.280e-01   0.368  0.71257    
## wd_avg               -7.299e-03  7.134e-04 -10.233  < 2e-16 ***
## ws_avg                1.466e-01  2.459e-02   5.961 2.51e-09 ***
## I(1/dist_wrp^2)       5.719e-06  3.854e-06   1.484  0.13783    
## I(1/dist_ref^2)      -3.114e-03  1.960e-04 -15.884  < 2e-16 ***
## I(1/dist_dc^2)        2.378e-01  1.234e-02  19.263  < 2e-16 ***
## monthly_oil_2km       4.562e-04  8.775e-05   5.198 2.01e-07 ***
## monthly_gas_2km       1.570e-05  1.810e-04   0.087  0.93090    
## active_2km            2.278e-01  4.164e-02   5.470 4.50e-08 ***
## inactive_2km          2.496e+00  1.173e-01  21.285  < 2e-16 ***
## elevation            -1.319e+00  5.301e-02 -24.877  < 2e-16 ***
## EVI                  -5.419e+01  2.179e+00 -24.870  < 2e-16 ***
## num_odor_complaints   5.945e-01  9.984e-03  59.542  < 2e-16 ***
## closest_wrp_capacity  4.191e-01  1.447e-02  28.968  < 2e-16 ***
## hourly_downwind_ref  -2.047e+00  2.154e-01  -9.502  < 2e-16 ***
## hourly_downwind_wrp   1.579e+00  2.416e-01   6.535 6.37e-11 ***
## hourly_temp          -2.244e-01  1.278e-02 -17.556  < 2e-16 ***
## hourly_hum           -5.482e-02  4.341e-03 -12.628  < 2e-16 ***
## hourly_precip        -1.161e+00  4.988e+00  -0.233  0.81593    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.937      8 55.26
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 91.91
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 77.563     80 76.00
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =  0.0475   Deviance explained = 4.79%
## GCV = 1369.4  Scale est. = 1369      n = 367838

Log Hourly Avg

Since February 2022

# Since feb 2022
summary(log_ha_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.336e+00  5.911e-02  56.447  < 2e-16 ***
## year2023            -9.704e-02  1.480e-02  -6.558 5.47e-11 ***
## weekdayMon           8.902e-02  5.302e-03  16.790  < 2e-16 ***
## weekdayTue           1.568e-01  5.255e-03  29.833  < 2e-16 ***
## weekdayWed           1.516e-01  5.268e-03  28.781  < 2e-16 ***
## weekdayThu           1.085e-01  5.272e-03  20.578  < 2e-16 ***
## weekdayFri           1.261e-01  5.252e-03  24.006  < 2e-16 ***
## weekdaySat           7.496e-02  5.237e-03  14.313  < 2e-16 ***
## wd_avg              -3.786e-04  1.685e-05 -22.469  < 2e-16 ***
## ws_avg              -6.109e-02  6.588e-04 -92.737  < 2e-16 ***
## I(1/dist_wrp^2)      1.258e-06  6.667e-08  18.867  < 2e-16 ***
## I(1/dist_ref^2)     -1.235e-05  1.520e-06  -8.126 4.48e-16 ***
## I(1/dist_dc^2)      -6.728e-04  2.313e-05 -29.090  < 2e-16 ***
## monthly_oil_2km     -1.150e-06  2.058e-06  -0.559   0.5764    
## monthly_gas_2km     -4.434e-05  6.029e-06  -7.354 1.93e-13 ***
## active_2km           1.721e-02  1.650e-03  10.432  < 2e-16 ***
## inactive_2km        -3.752e-03  3.427e-03  -1.095   0.2736    
## elevation           -4.561e-02  2.707e-03 -16.846  < 2e-16 ***
## EVI                 -2.375e+00  6.238e-02 -38.078  < 2e-16 ***
## num_odor_complaints  1.326e-02  6.863e-04  19.327  < 2e-16 ***
## hourly_downwind_ref -6.484e-03  5.150e-03  -1.259   0.2081    
## hourly_downwind_wrp  4.752e-02  5.953e-03   7.983 1.44e-15 ***
## hourly_temp         -3.135e-02  3.154e-04 -99.379  < 2e-16 ***
## hourly_hum          -9.800e-03  1.119e-04 -87.564  < 2e-16 ***
## hourly_precip        2.371e-01  9.590e-02   2.473   0.0134 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.878      8 144.2
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 335.7
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.994     80 843.7
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 119/122
## R-sq.(adj) =  0.547   Deviance explained = 54.7%
## GCV = 0.30229  Scale est. = 0.30206   n = 153718

Disaster Only

# Disaster only
summary(log_ha_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.338e+01  1.863e+00 -12.550  < 2e-16 ***
## month11               1.553e-01  3.505e-02   4.431 9.39e-06 ***
## month12              -1.110e-01  3.620e-02  -3.065  0.00218 ** 
## weekdayMon            2.516e-02  1.808e-02   1.392  0.16397    
## weekdayTue           -9.922e-02  1.814e-02  -5.470 4.54e-08 ***
## weekdayWed            1.126e-01  1.791e-02   6.291 3.20e-10 ***
## weekdayThu            9.509e-02  1.806e-02   5.265 1.41e-07 ***
## weekdayFri            3.090e-02  1.749e-02   1.767  0.07732 .  
## weekdaySat           -8.340e-02  1.769e-02  -4.714 2.44e-06 ***
## wd_avg               -7.720e-04  4.893e-05 -15.778  < 2e-16 ***
## ws_avg               -1.110e-01  2.666e-03 -41.647  < 2e-16 ***
## I(1/dist_wrp^2)      -4.535e-05  3.437e-06 -13.194  < 2e-16 ***
## I(1/dist_ref^2)      -2.659e-04  1.972e-05 -13.487  < 2e-16 ***
## I(1/dist_dc^2)        3.288e-01  2.425e-02  13.559  < 2e-16 ***
## monthly_oil_2km       6.284e-04  4.022e-05  15.627  < 2e-16 ***
## active_2km           -3.871e-01  1.253e-02 -30.882  < 2e-16 ***
## inactive_2km          6.700e-01  4.309e-02  15.551  < 2e-16 ***
## elevation            -2.089e-01  1.147e-02 -18.209  < 2e-16 ***
## num_odor_complaints   4.279e-03  4.510e-04   9.489  < 2e-16 ***
## closest_wrp_capacity  6.845e-02  3.546e-03  19.302  < 2e-16 ***
## hourly_downwind_ref  -1.641e-01  1.466e-02 -11.197  < 2e-16 ***
## hourly_downwind_wrp   2.034e-01  1.867e-02  10.898  < 2e-16 ***
## hourly_temp          -3.594e-02  9.671e-04 -37.166  < 2e-16 ***
## hourly_hum           -1.034e-02  2.939e-04 -35.177  < 2e-16 ***
## hourly_precip         1.048e+00  2.535e-01   4.134 3.57e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 250.62
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.37     80  88.83
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =  0.583   Deviance explained = 58.5%
## GCV = 0.67426  Scale est. = 0.6718    n = 30242

Exclude Disaster

# Exclude disaster
summary(log_ha_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           3.438e+00  1.212e-01   28.361  < 2e-16 ***
## year2021              1.177e-01  1.190e-02    9.893  < 2e-16 ***
## year2022              5.850e-01  1.884e-02   31.048  < 2e-16 ***
## year2023              6.066e-01  1.822e-02   33.297  < 2e-16 ***
## weekdayMon            9.370e-02  4.054e-03   23.110  < 2e-16 ***
## weekdayTue            1.468e-01  4.021e-03   36.515  < 2e-16 ***
## weekdayWed            1.700e-01  4.017e-03   42.329  < 2e-16 ***
## weekdayThu            1.546e-01  4.016e-03   38.487  < 2e-16 ***
## weekdayFri            1.446e-01  4.018e-03   35.979  < 2e-16 ***
## weekdaySat            7.364e-02  4.010e-03   18.365  < 2e-16 ***
## wd_avg               -6.106e-04  1.275e-05  -47.905  < 2e-16 ***
## ws_avg               -3.803e-02  4.269e-04  -89.094  < 2e-16 ***
## I(1/dist_wrp^2)       2.227e-06  1.127e-07   19.759  < 2e-16 ***
## I(1/dist_ref^2)      -9.849e-05  4.990e-06  -19.737  < 2e-16 ***
## I(1/dist_dc^2)        1.579e-03  1.718e-04    9.192  < 2e-16 ***
## monthly_oil_2km      -7.810e-06  1.655e-06   -4.719 2.37e-06 ***
## monthly_gas_2km      -6.010e-05  3.268e-06  -18.391  < 2e-16 ***
## active_2km            2.310e-02  7.677e-04   30.086  < 2e-16 ***
## inactive_2km         -1.921e-02  2.228e-03   -8.624  < 2e-16 ***
## elevation            -2.088e-03  9.309e-04   -2.243   0.0249 *  
## EVI                  -2.289e+00  3.907e-02  -58.570  < 2e-16 ***
## num_odor_complaints   2.732e-03  3.126e-04    8.740  < 2e-16 ***
## closest_wrp_capacity -2.991e-03  2.680e-04  -11.159  < 2e-16 ***
## hourly_downwind_ref  -3.551e-02  3.854e-03   -9.212  < 2e-16 ***
## hourly_downwind_wrp   6.536e-02  4.262e-03   15.337  < 2e-16 ***
## hourly_temp          -3.147e-02  2.335e-04 -134.734  < 2e-16 ***
## hourly_hum           -8.700e-03  7.993e-05 -108.836  < 2e-16 ***
## hourly_precip        -7.798e-01  9.530e-02   -8.183 2.77e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df    F
## s(as.numeric(month))                                     7.997      8 1039
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 1335
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.987     80 1297
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =  0.444   Deviance explained = 44.4%
## GCV = 0.38907  Scale est. = 0.38893   n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(log_ha_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     active_2km + inactive_2km + elevation + EVI + num_odor_complaints + 
##     closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -2.067e-02  1.212e-01   -0.171  0.86458    
## year2021              8.026e-01  1.377e-02   58.302  < 2e-16 ***
## year2022              9.504e-01  1.786e-02   53.227  < 2e-16 ***
## year2023              8.091e-01  1.674e-02   48.335  < 2e-16 ***
## weekdayMon            8.113e-02  4.203e-03   19.304  < 2e-16 ***
## weekdayTue            1.188e-01  4.169e-03   28.483  < 2e-16 ***
## weekdayWed            1.595e-01  4.164e-03   38.294  < 2e-16 ***
## weekdayThu            1.431e-01  4.163e-03   34.359  < 2e-16 ***
## weekdayFri            1.285e-01  4.159e-03   30.896  < 2e-16 ***
## weekdaySat            5.813e-02  4.157e-03   13.983  < 2e-16 ***
## wd_avg               -6.860e-04  1.301e-05  -52.730  < 2e-16 ***
## ws_avg               -4.032e-02  4.487e-04  -89.855  < 2e-16 ***
## I(1/dist_wrp^2)       1.401e-06  9.995e-08   14.014  < 2e-16 ***
## I(1/dist_ref^2)      -1.294e-04  4.461e-06  -29.013  < 2e-16 ***
## I(1/dist_dc^2)        8.653e-03  2.466e-04   35.091  < 2e-16 ***
## monthly_oil_2km      -5.838e-06  1.559e-06   -3.745  0.00018 ***
## active_2km            3.100e-02  7.878e-04   39.344  < 2e-16 ***
## inactive_2km          1.274e-03  2.267e-03    0.562  0.57408    
## elevation            -2.284e-02  9.736e-04  -23.459  < 2e-16 ***
## EVI                  -3.169e+00  3.971e-02  -79.798  < 2e-16 ***
## num_odor_complaints   1.141e-02  1.833e-04   62.257  < 2e-16 ***
## closest_wrp_capacity  3.900e-03  2.660e-04   14.661  < 2e-16 ***
## hourly_downwind_ref  -7.036e-02  3.929e-03  -17.906  < 2e-16 ***
## hourly_downwind_wrp   8.627e-02  4.403e-03   19.591  < 2e-16 ***
## hourly_temp          -3.407e-02  2.332e-04 -146.077  < 2e-16 ***
## hourly_hum           -9.317e-03  7.917e-05 -117.683  < 2e-16 ***
## hourly_precip        -8.818e-01  9.098e-02   -9.692  < 2e-16 ***
## disaster              7.991e-01  1.135e-02   70.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.971      8  570.4
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.996      9  851.2
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.931     80 1225.0
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =  0.443   Deviance explained = 44.3%
## GCV = 0.45541  Scale est. = 0.45526   n = 367838

Everything w.o Disaster Indicator

# Everything
summary(log_ha_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     active_2km + inactive_2km + elevation + EVI + num_odor_complaints + 
##     closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           9.748e-01  1.212e-01    8.042 8.83e-16 ***
## year2021              2.031e-01  1.089e-02   18.655  < 2e-16 ***
## year2022              1.974e-01  1.439e-02   13.718  < 2e-16 ***
## year2023              3.022e-01  1.521e-02   19.865  < 2e-16 ***
## weekdayMon            8.100e-02  4.231e-03   19.144  < 2e-16 ***
## weekdayTue            1.198e-01  4.197e-03   28.542  < 2e-16 ***
## weekdayWed            1.603e-01  4.192e-03   38.239  < 2e-16 ***
## weekdayThu            1.428e-01  4.191e-03   34.058  < 2e-16 ***
## weekdayFri            1.296e-01  4.187e-03   30.966  < 2e-16 ***
## weekdaySat            5.958e-02  4.185e-03   14.237  < 2e-16 ***
## wd_avg               -6.873e-04  1.310e-05  -52.484  < 2e-16 ***
## ws_avg               -4.127e-02  4.516e-04  -91.394  < 2e-16 ***
## I(1/dist_wrp^2)       1.724e-06  9.896e-08   17.427  < 2e-16 ***
## I(1/dist_ref^2)      -1.317e-04  4.499e-06  -29.269  < 2e-16 ***
## I(1/dist_dc^2)        7.371e-03  2.379e-04   30.983  < 2e-16 ***
## monthly_oil_2km      -2.352e-06  1.568e-06   -1.500    0.134    
## active_2km            2.248e-02  7.841e-04   28.664  < 2e-16 ***
## inactive_2km          1.950e-02  2.270e-03    8.592  < 2e-16 ***
## elevation            -2.771e-02  9.777e-04  -28.342  < 2e-16 ***
## EVI                  -3.077e+00  3.995e-02  -77.014  < 2e-16 ***
## num_odor_complaints   1.285e-02  1.833e-04   70.093  < 2e-16 ***
## closest_wrp_capacity  4.307e-03  2.678e-04   16.082  < 2e-16 ***
## hourly_downwind_ref  -7.210e-02  3.956e-03  -18.228  < 2e-16 ***
## hourly_downwind_wrp   8.823e-02  4.433e-03   19.904  < 2e-16 ***
## hourly_temp          -3.366e-02  2.347e-04 -143.410  < 2e-16 ***
## hourly_hum           -9.297e-03  7.970e-05 -116.645  < 2e-16 ***
## hourly_precip        -9.332e-01  9.158e-02  -10.190  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.951      8  651.1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.997      9  775.6
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.952     80 1219.0
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/124
## R-sq.(adj) =  0.435   Deviance explained = 43.6%
## GCV = 0.46155  Scale est. = 0.4614    n = 367838

Hourly Max

Since February 2022

# Since feb 2022
summary(hm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + inactive_2km + elevation + EVI + num_odor_complaints + 
##     closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.120e+00  3.168e-01  22.474  < 2e-16 ***
## year2023             -1.849e-01  3.178e-02  -5.817 5.99e-09 ***
## weekdayMon            1.227e-01  1.143e-02  10.736  < 2e-16 ***
## weekdayTue            2.126e-01  1.133e-02  18.771  < 2e-16 ***
## weekdayWed            2.206e-01  1.136e-02  19.423  < 2e-16 ***
## weekdayThu            1.692e-01  1.136e-02  14.891  < 2e-16 ***
## weekdayFri            1.987e-01  1.132e-02  17.547  < 2e-16 ***
## weekdaySat            8.991e-02  1.129e-02   7.964 1.68e-15 ***
## wd_avg               -6.799e-05  3.632e-05  -1.872 0.061225 .  
## ws_avg               -6.652e-02  1.420e-03 -46.847  < 2e-16 ***
## I(1/dist_wrp^2)       7.978e-07  1.345e-07   5.932 3.00e-09 ***
## I(1/dist_ref^2)       2.187e-05  1.478e-06  14.797  < 2e-16 ***
## I(1/dist_dc^2)       -4.692e-04  2.367e-05 -19.828  < 2e-16 ***
## monthly_oil_2km       1.217e-05  4.157e-06   2.928 0.003411 ** 
## monthly_gas_2km       2.136e-06  1.268e-05   0.168 0.866236    
## inactive_2km         -1.685e-02  8.289e-03  -2.033 0.042025 *  
## elevation            -4.174e-02  7.125e-03  -5.857 4.71e-09 ***
## EVI                  -1.478e+00  1.340e-01 -11.028  < 2e-16 ***
## num_odor_complaints   3.373e-02  1.479e-03  22.803  < 2e-16 ***
## closest_wrp_capacity -3.793e-03  8.208e-04  -4.621 3.82e-06 ***
## hourly_downwind_ref  -4.602e-02  1.110e-02  -4.145 3.41e-05 ***
## hourly_downwind_wrp   4.670e-02  1.283e-02   3.640 0.000273 ***
## hourly_temp          -3.803e-02  6.784e-04 -56.060  < 2e-16 ***
## hourly_hum           -1.131e-02  2.406e-04 -46.995  < 2e-16 ***
## hourly_precip         1.131e+00  2.067e-01   5.469 4.54e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.659   8.00 35.07
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.260   8.26 96.92
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.974  80.00 96.24
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 118/122
## R-sq.(adj) =  0.175   Deviance explained = 17.5%
## GCV = 1.4048  Scale est. = 1.4037    n = 153718

Disaster Only

# Disaster only
summary(hm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month + 
##     weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + 
##     I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.751e+02  5.365e+02   1.445 0.148540    
## month11              -1.138e+01  1.069e+01  -1.065 0.286956    
## month12              -6.246e-01  1.157e+01  -0.054 0.956966    
## weekdayMon           -4.502e+00  5.492e+00  -0.820 0.412317    
## weekdayTue           -1.019e+01  5.510e+00  -1.850 0.064385 .  
## weekdayWed           -7.691e-01  5.439e+00  -0.141 0.887550    
## weekdayThu            5.550e+00  5.432e+00   1.022 0.306954    
## weekdayFri            9.091e+00  5.312e+00   1.711 0.087021 .  
## weekdaySat            7.882e+00  5.374e+00   1.467 0.142518    
## wd_avg               -7.400e-02  1.474e-02  -5.020 5.21e-07 ***
## ws_avg                3.090e+00  7.859e-01   3.932 8.45e-05 ***
## I(1/dist_wrp^2)      -1.485e-03  3.691e-04  -4.023 5.76e-05 ***
## I(1/dist_ref^2)      -3.664e-03  6.026e-03  -0.608 0.543217    
## I(1/dist_dc^2)        1.315e+01  3.620e+00   3.632 0.000282 ***
## monthly_oil_2km      -3.991e-02  1.803e-02  -2.213 0.026905 *  
## monthly_gas_2km      -6.194e-02  4.771e-02  -1.298 0.194215    
## active_2km           -2.612e+01  4.175e+00  -6.257 3.97e-10 ***
## inactive_2km          1.629e+02  1.456e+01  11.184  < 2e-16 ***
## elevation            -8.281e+01  3.893e+00 -21.274  < 2e-16 ***
## EVI                  -1.337e+03  9.018e+01 -14.826  < 2e-16 ***
## num_odor_complaints  -2.810e+00  1.361e-01 -20.642  < 2e-16 ***
## closest_wrp_capacity  2.611e+00  9.956e-01   2.623 0.008732 ** 
## hourly_downwind_wrp   1.212e+01  5.625e+00   2.155 0.031166 *  
## hourly_temp          -1.725e+00  2.936e-01  -5.876 4.25e-09 ***
## hourly_hum           -2.430e-01  8.825e-02  -2.754 0.005899 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.723  8.939 92.02
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.248 80.000 68.80
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =  0.223   Deviance explained = 22.6%
## GCV =  62223  Scale est. = 61999     n = 30242

Exclude Disaster

# Exclude disaster
summary(hm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.511e+00  1.998e-01  37.594  < 2e-16 ***
## year2021              5.362e-02  2.429e-02   2.208   0.0273 *  
## year2022              7.902e-01  3.857e-02  20.485  < 2e-16 ***
## year2023              6.866e-01  3.720e-02  18.457  < 2e-16 ***
## weekdayMon            1.369e-01  8.297e-03  16.496  < 2e-16 ***
## weekdayTue            2.172e-01  8.229e-03  26.400  < 2e-16 ***
## weekdayWed            2.380e-01  8.221e-03  28.949  < 2e-16 ***
## weekdayThu            2.190e-01  8.219e-03  26.642  < 2e-16 ***
## weekdayFri            2.048e-01  8.222e-03  24.906  < 2e-16 ***
## weekdaySat            9.189e-02  8.207e-03  11.198  < 2e-16 ***
## wd_avg               -3.411e-04  2.608e-05 -13.080  < 2e-16 ***
## ws_avg               -4.306e-02  8.735e-04 -49.299  < 2e-16 ***
## I(1/dist_wrp^2)      -5.954e-01  2.387e-02 -24.946  < 2e-16 ***
## I(1/dist_ref^2)       5.514e+00  2.210e-01  24.946  < 2e-16 ***
## I(1/dist_dc^2)        4.001e+03  1.604e+02  24.946  < 2e-16 ***
## active_2km            1.363e-02  1.446e-03   9.429  < 2e-16 ***
## inactive_2km         -6.320e-02  4.472e-03 -14.132  < 2e-16 ***
## elevation            -7.581e-04  1.921e-03  -0.395   0.6931    
## EVI                  -1.173e+00  8.060e-02 -14.553  < 2e-16 ***
## num_odor_complaints   1.364e-02  6.396e-04  21.318  < 2e-16 ***
## closest_wrp_capacity -7.191e-03  4.534e-04 -15.858  < 2e-16 ***
## hourly_downwind_ref  -6.138e-02  7.893e-03  -7.776 7.53e-15 ***
## hourly_downwind_wrp   4.410e-02  8.714e-03   5.061 4.17e-07 ***
## hourly_temp          -3.956e-02  4.779e-04 -82.787  < 2e-16 ***
## hourly_hum           -1.116e-02  1.636e-04 -68.199  < 2e-16 ***
## hourly_precip        -5.181e-02  1.950e-01  -0.266   0.7905    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.965  8.000 242.2
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.945  8.992 253.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.659 80.000 187.1
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =   0.15   Deviance explained =   15%
## GCV = 1.6295  Scale est. = 1.6289    n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(hm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.670e+02  1.413e+01 -25.969  < 2e-16 ***
## year2021              2.018e+01  1.606e+00  12.568  < 2e-16 ***
## year2022              2.582e+01  2.078e+00  12.422  < 2e-16 ***
## year2023              1.517e+01  1.950e+00   7.781 7.20e-15 ***
## weekdayMon           -1.362e+00  4.926e-01  -2.765  0.00569 ** 
## weekdayTue           -1.285e+00  4.887e-01  -2.631  0.00853 ** 
## weekdayWed           -7.367e-01  4.881e-01  -1.509  0.13121    
## weekdayThu           -2.720e-01  4.879e-01  -0.557  0.57721    
## weekdayFri            4.210e-01  4.874e-01   0.864  0.38773    
## weekdaySat            3.651e-01  4.873e-01   0.749  0.45366    
## wd_avg               -1.679e-02  1.523e-03 -11.024  < 2e-16 ***
## ws_avg                3.779e-01  5.220e-02   7.241 4.46e-13 ***
## I(1/dist_wrp^2)       8.737e-06  8.794e-06   0.993  0.32047    
## I(1/dist_ref^2)      -7.063e-03  4.412e-04 -16.010  < 2e-16 ***
## I(1/dist_dc^2)        5.548e-01  2.627e-02  21.119  < 2e-16 ***
## monthly_oil_2km       8.650e-04  1.890e-04   4.576 4.74e-06 ***
## monthly_gas_2km       1.396e-04  4.037e-04   0.346  0.72950    
## active_2km            7.637e-01  9.040e-02   8.449  < 2e-16 ***
## inactive_2km          4.959e+00  2.540e-01  19.525  < 2e-16 ***
## elevation            -2.749e+00  1.138e-01 -24.162  < 2e-16 ***
## EVI                  -1.210e+02  4.664e+00 -25.946  < 2e-16 ***
## num_odor_complaints   1.268e+00  2.147e-02  59.044  < 2e-16 ***
## closest_wrp_capacity  9.131e-01  3.104e-02  29.419  < 2e-16 ***
## hourly_downwind_ref  -4.366e+00  4.604e-01  -9.484  < 2e-16 ***
## hourly_downwind_wrp   3.046e+00  5.165e-01   5.898 3.68e-09 ***
## hourly_temp          -4.631e-01  2.729e-02 -16.972  < 2e-16 ***
## hourly_hum           -1.129e-01  9.201e-03 -12.272  < 2e-16 ***
## disaster              2.610e+01  1.326e+00  19.679  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.933      8 46.65
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 98.34
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.203     80 82.06
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =  0.0505   Deviance explained = 5.09%
## GCV = 6256.4  Scale est. = 6254.3    n = 367838

Everything w.o Disaster Indicator

# Everything
summary(hm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.333e+02  1.399e+01 -23.814  < 2e-16 ***
## year2021              6.996e-01  1.264e+00   0.554   0.5799    
## year2022              1.328e+00  1.667e+00   0.797   0.4255    
## year2023             -1.128e+00  1.757e+00  -0.642   0.5206    
## weekdayMon           -1.366e+00  4.929e-01  -2.771   0.0056 ** 
## weekdayTue           -1.250e+00  4.889e-01  -2.556   0.0106 *  
## weekdayWed           -7.087e-01  4.884e-01  -1.451   0.1467    
## weekdayThu           -2.809e-01  4.882e-01  -0.575   0.5651    
## weekdayFri            4.593e-01  4.877e-01   0.942   0.3463    
## weekdaySat            4.126e-01  4.875e-01   0.846   0.3974    
## wd_avg               -1.683e-02  1.526e-03 -11.031  < 2e-16 ***
## ws_avg                3.468e-01  5.259e-02   6.594 4.30e-11 ***
## I(1/dist_wrp^2)       1.139e-05  8.281e-06   1.375   0.1691    
## I(1/dist_ref^2)      -6.810e-03  4.205e-04 -16.196  < 2e-16 ***
## I(1/dist_dc^2)        5.209e-01  2.643e-02  19.704  < 2e-16 ***
## monthly_oil_2km       9.888e-04  1.878e-04   5.266 1.40e-07 ***
## monthly_gas_2km       5.603e-05  3.883e-04   0.144   0.8853    
## active_2km            5.029e-01  8.908e-02   5.645 1.65e-08 ***
## inactive_2km          5.486e+00  2.509e-01  21.870  < 2e-16 ***
## elevation            -2.897e+00  1.134e-01 -25.554  < 2e-16 ***
## EVI                  -1.179e+02  4.660e+00 -25.306  < 2e-16 ***
## num_odor_complaints   1.315e+00  2.135e-02  61.602  < 2e-16 ***
## closest_wrp_capacity  9.232e-01  3.095e-02  29.826  < 2e-16 ***
## hourly_downwind_ref  -4.424e+00  4.607e-01  -9.602  < 2e-16 ***
## hourly_downwind_wrp   3.119e+00  5.167e-01   6.036 1.58e-09 ***
## hourly_temp          -4.497e-01  2.734e-02 -16.450  < 2e-16 ***
## hourly_hum           -1.123e-01  9.283e-03 -12.096  < 2e-16 ***
## hourly_precip        -2.137e+00  1.067e+01  -0.200   0.8412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.936      8 55.46
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 96.91
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 77.617     80 79.98
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =  0.0495   Deviance explained = 4.99%
## GCV =   6263  Scale est. = 6260.9    n = 367838

Log Hourly Max

Since February 2022

# Since feb 2022
summary(log_hm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.479e+00  6.855e-02  50.745  < 2e-16 ***
## year2023            -1.173e-01  1.715e-02  -6.840 7.92e-12 ***
## weekdayMon           1.137e-01  6.151e-03  18.479  < 2e-16 ***
## weekdayTue           1.806e-01  6.096e-03  29.626  < 2e-16 ***
## weekdayWed           1.744e-01  6.111e-03  28.536  < 2e-16 ***
## weekdayThu           1.292e-01  6.115e-03  21.126  < 2e-16 ***
## weekdayFri           1.427e-01  6.093e-03  23.423  < 2e-16 ***
## weekdaySat           8.645e-02  6.075e-03  14.229  < 2e-16 ***
## wd_avg              -4.281e-04  1.955e-05 -21.905  < 2e-16 ***
## ws_avg              -6.686e-02  7.642e-04 -87.498  < 2e-16 ***
## I(1/dist_wrp^2)      6.971e-07  7.730e-08   9.018  < 2e-16 ***
## I(1/dist_ref^2)      2.743e-07  1.763e-06   0.156 0.876375    
## I(1/dist_dc^2)      -5.415e-04  2.683e-05 -20.182  < 2e-16 ***
## monthly_oil_2km     -5.215e-06  2.388e-06  -2.184 0.028956 *  
## monthly_gas_2km     -3.444e-05  6.993e-06  -4.924 8.48e-07 ***
## active_2km           2.074e-02  1.913e-03  10.840  < 2e-16 ***
## inactive_2km        -1.233e-02  3.975e-03  -3.102 0.001923 ** 
## elevation           -3.940e-02  3.140e-03 -12.548  < 2e-16 ***
## EVI                 -2.347e+00  7.235e-02 -32.431  < 2e-16 ***
## num_odor_complaints  1.645e-02  7.961e-04  20.659  < 2e-16 ***
## hourly_downwind_ref -4.962e-03  5.975e-03  -0.831 0.406204    
## hourly_downwind_wrp  7.644e-02  6.905e-03  11.070  < 2e-16 ***
## hourly_temp         -3.153e-02  3.658e-04 -86.202  < 2e-16 ***
## hourly_hum          -1.023e-02  1.298e-04 -78.825  < 2e-16 ***
## hourly_precip        4.100e-01  1.112e-01   3.685 0.000228 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.832      8 113.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.999      9 223.6
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.996     80 658.9
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 119/122
## R-sq.(adj) =  0.495   Deviance explained = 49.6%
## GCV = 0.40679  Scale est. = 0.40648   n = 153718

Disaster Only

# Disaster only
summary(log_hm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.300e+01  2.000e+00 -11.498  < 2e-16 ***
## month11               1.587e-01  3.764e-02   4.216 2.50e-05 ***
## month12              -7.281e-02  3.887e-02  -1.873 0.061078 .  
## weekdayMon            5.560e-02  1.941e-02   2.864 0.004188 ** 
## weekdayTue           -7.745e-02  1.948e-02  -3.976 7.04e-05 ***
## weekdayWed            1.200e-01  1.923e-02   6.242 4.37e-10 ***
## weekdayThu            1.086e-01  1.940e-02   5.597 2.20e-08 ***
## weekdayFri            3.229e-02  1.879e-02   1.719 0.085638 .  
## weekdaySat           -7.210e-02  1.900e-02  -3.795 0.000148 ***
## wd_avg               -7.589e-04  5.254e-05 -14.443  < 2e-16 ***
## ws_avg               -1.126e-01  2.863e-03 -39.351  < 2e-16 ***
## I(1/dist_wrp^2)      -4.404e-05  3.692e-06 -11.929  < 2e-16 ***
## I(1/dist_ref^2)      -2.540e-04  2.117e-05 -11.999  < 2e-16 ***
## I(1/dist_dc^2)        3.175e-01  2.604e-02  12.191  < 2e-16 ***
## monthly_oil_2km       6.092e-04  4.319e-05  14.105  < 2e-16 ***
## active_2km           -4.152e-01  1.346e-02 -30.847  < 2e-16 ***
## inactive_2km          7.570e-01  4.627e-02  16.360  < 2e-16 ***
## elevation            -2.215e-01  1.232e-02 -17.975  < 2e-16 ***
## num_odor_complaints   4.079e-03  4.842e-04   8.424  < 2e-16 ***
## closest_wrp_capacity  6.850e-02  3.809e-03  17.985  < 2e-16 ***
## hourly_downwind_ref  -1.588e-01  1.574e-02 -10.090  < 2e-16 ***
## hourly_downwind_wrp   2.311e-01  2.005e-02  11.528  < 2e-16 ***
## hourly_temp          -3.327e-02  1.039e-03 -32.031  < 2e-16 ***
## hourly_hum           -9.715e-03  3.157e-04 -30.775  < 2e-16 ***
## hourly_precip         1.019e+00  2.722e-01   3.744 0.000181 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 238.98
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.36     80  89.27
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =   0.56   Deviance explained = 56.2%
## GCV = 0.77762  Scale est. = 0.77478   n = 30242

Exclude Disaster

# Exclude disaster
summary(log_hm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           4.634e+00  1.343e-01   34.500  < 2e-16 ***
## year2021              9.424e-02  1.318e-02    7.148 8.82e-13 ***
## year2022              5.780e-01  2.088e-02   27.687  < 2e-16 ***
## year2023              6.159e-01  2.018e-02   30.512  < 2e-16 ***
## weekdayMon            1.199e-01  4.493e-03   26.683  < 2e-16 ***
## weekdayTue            1.721e-01  4.455e-03   38.635  < 2e-16 ***
## weekdayWed            1.958e-01  4.451e-03   43.977  < 2e-16 ***
## weekdayThu            1.770e-01  4.451e-03   39.768  < 2e-16 ***
## weekdayFri            1.646e-01  4.452e-03   36.969  < 2e-16 ***
## weekdaySat            8.162e-02  4.444e-03   18.368  < 2e-16 ***
## wd_avg               -6.350e-04  1.412e-05  -44.959  < 2e-16 ***
## ws_avg               -4.240e-02  4.730e-04  -89.640  < 2e-16 ***
## I(1/dist_wrp^2)       1.299e-06  1.247e-07   10.419  < 2e-16 ***
## I(1/dist_ref^2)      -4.757e-05  5.521e-06   -8.617  < 2e-16 ***
## I(1/dist_dc^2)        1.379e-03  1.904e-04    7.240 4.49e-13 ***
## monthly_oil_2km      -1.082e-05  1.834e-06   -5.903 3.57e-09 ***
## monthly_gas_2km      -5.363e-05  3.621e-06  -14.812  < 2e-16 ***
## active_2km            2.563e-02  8.506e-04   30.131  < 2e-16 ***
## inactive_2km         -4.244e-02  2.468e-03  -17.200  < 2e-16 ***
## elevation             4.822e-03  1.032e-03    4.674 2.95e-06 ***
## EVI                  -2.087e+00  4.330e-02  -48.198  < 2e-16 ***
## num_odor_complaints   4.266e-03  3.464e-04   12.314  < 2e-16 ***
## closest_wrp_capacity -5.463e-03  2.970e-04  -18.395  < 2e-16 ***
## hourly_downwind_ref  -3.337e-02  4.271e-03   -7.814 5.55e-15 ***
## hourly_downwind_wrp   8.360e-02  4.722e-03   17.703  < 2e-16 ***
## hourly_temp          -3.081e-02  2.588e-04 -119.072  < 2e-16 ***
## hourly_hum           -8.804e-03  8.857e-05  -99.407  < 2e-16 ***
## hourly_precip        -6.942e-01  1.056e-01   -6.574 4.92e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.978      8  876.7
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 1206.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.979     80 1125.7
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =  0.411   Deviance explained = 41.1%
## GCV = 0.47774  Scale est. = 0.47757   n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(log_hm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip + 
##     disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           9.489e-01  1.333e-01    7.118 1.09e-12 ***
## year2021              8.889e-01  1.512e-02   58.793  < 2e-16 ***
## year2022              1.034e+00  1.960e-02   52.740  < 2e-16 ***
## year2023              9.078e-01  1.848e-02   49.122  < 2e-16 ***
## weekdayMon            1.078e-01  4.608e-03   23.389  < 2e-16 ***
## weekdayTue            1.440e-01  4.571e-03   31.503  < 2e-16 ***
## weekdayWed            1.841e-01  4.566e-03   40.324  < 2e-16 ***
## weekdayThu            1.651e-01  4.565e-03   36.163  < 2e-16 ***
## weekdayFri            1.471e-01  4.560e-03   32.271  < 2e-16 ***
## weekdaySat            6.604e-02  4.558e-03   14.491  < 2e-16 ***
## wd_avg               -7.062e-04  1.426e-05  -49.513  < 2e-16 ***
## ws_avg               -4.431e-02  4.920e-04  -90.057  < 2e-16 ***
## I(1/dist_wrp^2)       4.094e-07  1.043e-07    3.924 8.71e-05 ***
## I(1/dist_ref^2)      -7.222e-05  5.171e-06  -13.968  < 2e-16 ***
## I(1/dist_dc^2)        7.702e-03  2.318e-04   33.222  < 2e-16 ***
## monthly_oil_2km      -6.735e-06  1.786e-06   -3.772 0.000162 ***
## monthly_gas_2km      -2.692e-05  4.007e-06   -6.717 1.86e-11 ***
## active_2km            3.392e-02  8.637e-04   39.277  < 2e-16 ***
## inactive_2km         -1.671e-02  2.482e-03   -6.732 1.67e-11 ***
## elevation            -1.697e-02  1.067e-03  -15.901  < 2e-16 ***
## EVI                  -2.988e+00  4.368e-02  -68.400  < 2e-16 ***
## num_odor_complaints   1.256e-02  2.009e-04   62.519  < 2e-16 ***
## closest_wrp_capacity  1.663e-03  2.927e-04    5.681 1.34e-08 ***
## hourly_downwind_ref  -7.028e-02  4.308e-03  -16.314  < 2e-16 ***
## hourly_downwind_wrp   1.035e-01  4.832e-03   21.420  < 2e-16 ***
## hourly_temp          -3.352e-02  2.557e-04 -131.116  < 2e-16 ***
## hourly_hum           -9.410e-03  8.680e-05 -108.406  < 2e-16 ***
## hourly_precip        -7.807e-01  9.974e-02   -7.828 4.99e-15 ***
## disaster              8.926e-01  1.244e-02   71.741  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.967      8  497.7
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.995      9  595.7
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.896     80 1067.3
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 123/126
## R-sq.(adj) =  0.418   Deviance explained = 41.8%
## GCV = 0.54738  Scale est. = 0.5472    n = 367838

Everything w.o Disaster Indicator

# Everything
summary(log_hm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.054e+00  1.333e-01   15.405  < 2e-16 ***
## year2021              2.187e-01  1.197e-02   18.275  < 2e-16 ***
## year2022              1.921e-01  1.581e-02   12.152  < 2e-16 ***
## year2023              3.404e-01  1.682e-02   20.237  < 2e-16 ***
## weekdayMon            1.076e-01  4.640e-03   23.196  < 2e-16 ***
## weekdayTue            1.452e-01  4.603e-03   31.540  < 2e-16 ***
## weekdayWed            1.850e-01  4.597e-03   40.247  < 2e-16 ***
## weekdayThu            1.647e-01  4.596e-03   35.838  < 2e-16 ***
## weekdayFri            1.484e-01  4.591e-03   32.326  < 2e-16 ***
## weekdaySat            6.766e-02  4.589e-03   14.743  < 2e-16 ***
## wd_avg               -7.079e-04  1.436e-05  -49.287  < 2e-16 ***
## ws_avg               -4.538e-02  4.952e-04  -91.627  < 2e-16 ***
## I(1/dist_wrp^2)       7.347e-07  1.029e-07    7.138 9.47e-13 ***
## I(1/dist_ref^2)      -7.509e-05  5.212e-06  -14.406  < 2e-16 ***
## I(1/dist_dc^2)        6.627e-03  2.277e-04   29.104  < 2e-16 ***
## monthly_oil_2km      -3.143e-06  1.797e-06   -1.749   0.0803 .  
## monthly_gas_2km      -2.432e-05  4.037e-06   -6.024 1.70e-09 ***
## active_2km            2.440e-02  8.599e-04   28.379  < 2e-16 ***
## inactive_2km          3.587e-03  2.486e-03    1.443   0.1491    
## elevation            -2.241e-02  1.072e-03  -20.905  < 2e-16 ***
## EVI                  -2.888e+00  4.396e-02  -65.692  < 2e-16 ***
## num_odor_complaints   1.417e-02  2.010e-04   70.489  < 2e-16 ***
## closest_wrp_capacity  2.133e-03  2.947e-04    7.235 4.66e-13 ***
## hourly_downwind_ref  -7.222e-02  4.338e-03  -16.649  < 2e-16 ***
## hourly_downwind_wrp   1.058e-01  4.865e-03   21.751  < 2e-16 ***
## hourly_temp          -3.306e-02  2.574e-04 -128.472  < 2e-16 ***
## hourly_hum           -9.386e-03  8.740e-05 -107.398  < 2e-16 ***
## hourly_precip        -8.380e-01  1.004e-01   -8.344  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.937      8  601.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.994      9  511.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.929     80 1056.5
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/125
## R-sq.(adj) =   0.41   Deviance explained =   41%
## GCV = 0.55505  Scale est. = 0.55486   n = 367838

Helper Function

adj_r2 <- function(r2, n, p){
  # n-p since p here includes intercept
  return(1 - (1-r2)*(n - 1)/(n - p))
}

get_bt_adj_r2 <- function(name, response, daterange) {
  data <- get_data(response, daterange)
  model <- get(paste0(name,'_', daterange, '_gam'))
  response_colname <- names(model$model)[1]
  response_colname <- str_sub(response_colname, 5, -2)
  predictions <- predict(model, newdata = data) 
  bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))
  bt_adj_r2 <- adj_r2(bt_R2, summary(model)$n, summary(model)$np)
  return(bt_adj_r2)
}

GAM result

# comp r2
data <- get_data('log(H2S_daily_avg)', 'dis_ind')
model <- get(paste0('log_da','_', 'dis_ind', '_gam'))
response_colname <- names(model$model)[1]
response_colname <- str_sub(response_colname, 5, -2)
predictions <- predict(model, newdata = data) 
bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))

print('R2 computed using R2() function from caret package')
## [1] "R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname) %>% log(), predictions)
## [1] 0.5621578
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname) %>% log(), predictions),
       summary(model)$n, summary(model)$np)
## [1] 0.5587909
print('BT-R2 computed using R2() function from caret package')
## [1] "BT-R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname), exp(predictions))
## [1] 0.0004870551
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname), exp(predictions)),
       summary(model)$n, summary(model)$np)
## [1] -0.007199022
print('Returned by get_bt_adj_r2')
## [1] "Returned by get_bt_adj_r2"
get_bt_adj_r2('log_da', 'log(H2S_daily_avg)', 'dis_ind')
## [1] -0.007199022
print('Fit obs(y) ~ exp(predicted y)')
## [1] "Fit obs(y) ~ exp(predicted y)"
model_r2 <- lm(data %>% pull(response_colname) ~ exp(predictions))
summary(model_r2)$r.sq
## [1] 0.0004870551
summary(model_r2)$df[1] + summary(model_r2)$df[2]
## [1] 15595
summary(model_r2)$df[1]
## [1] 2
summary(model_r2)$adj.r.squared
## [1] 0.000422955
adj_r2(summary(model_r2)$r.sq,
       summary(model_r2)$df[1] + summary(model_r2)$df[2],
       summary(model_r2)$df[1])
## [1] 0.000422955
date_names <- c('Since Feb 2022', 'Disaster Only', 'Exclude Disaster',
                'Everything w. Disaster Indicator', 
                'Everything w.o Disaster Indicator')
response_disp_names <- c('Daily Avg', 'Log Daily Avg', 'Daily Max', 'Log Daily Max',
                    'Hourly Avg', 'Log Hourly Avg', 'Hourly Max', 'Log Hourly Max')

gam_result_table <- expand.grid(date_names, response_disp_names) %>%
  setNames(c('date_names', 'response_disp_names'))

date_name_conversion <- tibble(date_names = date_names,
                               daterange = dateranges)

response_name_conversion <- tibble(response_disp_names = unique(gam_result_table$response_disp_names),
                                   response_obj_name = c(daily_responses, hourly_responses),
                                   model_response_name = response_names,
                                   transformation = rep(c('', 'Log'), 4))

gam_result_table <- gam_result_table %>%
  left_join(date_name_conversion) %>%
  left_join(response_name_conversion)
## Joining with `by = join_by(date_names)`
## Joining with `by = join_by(response_disp_names)`
gam_result_table <- gam_result_table %>%
  mutate(adjr2 = NA, 
         bt_adjr2 = NA,
         n = NA,
         p = NA)

for (i in 1:nrow(gam_result_table)) {
  name <- gam_result_table$model_response_name[i]
  response <- gam_result_table$response_obj_name[i]
  daterange <- gam_result_table$daterange[i]
  model <- get(paste0(name,'_', daterange, '_gam'))
  if (str_detect(response, 'log\\(')) {
    bt_adjr2 <- get_bt_adj_r2(name, response, daterange)
  } else {
    bt_adjr2 <- NA
  }
  
  adjr2 <- summary(model)$r.sq
  n <- summary(model)$n
  p <- summary(model)$np
  
  new_columns <- tibble(adjr2 = adjr2, bt_adjr2 = bt_adjr2, n = n, p = p)
  new_row <- bind_cols(gam_result_table[i, 1:6], new_columns)
  gam_result_table[i, ] <- new_row
  print(str_glue('Completed {i} iterations'))
}
## Completed 1 iterations
## Completed 2 iterations
## Completed 3 iterations
## Completed 4 iterations
## Completed 5 iterations
## Completed 6 iterations
## Completed 7 iterations
## Completed 8 iterations
## Completed 9 iterations
## Completed 10 iterations
## Completed 11 iterations
## Completed 12 iterations
## Completed 13 iterations
## Completed 14 iterations
## Completed 15 iterations
## Completed 16 iterations
## Completed 17 iterations
## Completed 18 iterations
## Completed 19 iterations
## Completed 20 iterations
## Completed 21 iterations
## Completed 22 iterations
## Completed 23 iterations
## Completed 24 iterations
## Completed 25 iterations
## Completed 26 iterations
## Completed 27 iterations
## Completed 28 iterations
## Completed 29 iterations
## Completed 30 iterations
## Completed 31 iterations
## Completed 32 iterations
## Completed 33 iterations
## Completed 34 iterations
## Completed 35 iterations
## Completed 36 iterations
## Completed 37 iterations
## Completed 38 iterations
## Completed 39 iterations
## Completed 40 iterations
temp <- rep(rep(response_disp_names[!str_detect(response_disp_names, 'Log')], each =2), 5)

base_table <- gam_result_table %>%
  arrange(factor(date_names, levels = .env$date_names)) %>%
  mutate(response_base = temp,
       `bt_adjr2` = '') %>%
  filter(transformation == '') %>%
  select(all_of(c('date_names', 'response_base', 'model_response_name', 'adjr2', 'bt_adjr2', 'n', 'p'))) %>%
  select(-bt_adjr2)

log_table <- gam_result_table %>%
  arrange(factor(date_names, levels = .env$date_names)) %>%
  mutate(response_base = temp) %>%
  filter(transformation == 'Log') %>%
  select(all_of(c('date_names', 'response_base', 'model_response_name','adjr2', 'bt_adjr2', 'n', 'p')))

gam_result_table_fordisp <- base_table %>%
  left_join(log_table, join_by(date_names, response_base)) %>%
  select(-'date_names', -starts_with('model_response_name')) %>%
  setNames(c('Response', c('Adj.R2', 'N', 'P'), c('Adj.R2', 'BT-Adj.R2', 'N', 'P')))

gam_result_table_kable <- gam_result_table_fordisp %>%
  knitr::kable(format = 'latex', digits = 2) %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))

writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')

gam_result_table_fordisp %>%
  knitr::kable(format = 'html', digits = 2, table.attr = "style='width:100%;'") %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))
No Transformation
Log-Transformation
Response Adj.R2 N P Adj.R2 BT-Adj.R2 N P
Since Feb 2022
Daily Avg 0.62 6531 117 0.70 0.61 6531 116
Daily Max 0.22 6531 116 0.54 0.22 6531 115
Hourly Avg 0.35 153718 123 0.55 0.34 153718 122
Hourly Max 0.17 153718 122 0.50 0.16 153718 122
Disaster Only
Daily Avg 0.42 1273 105 0.75 0.55 1273 111
Daily Max 0.43 1273 99 0.72 0.44 1273 110
Hourly Avg 0.21 30242 114 0.58 0.22 30242 114
Hourly Max 0.22 30242 114 0.56 0.24 30242 114
Exclude Disaster
Daily Avg 0.46 14322 120 0.57 0.46 14322 121
Daily Max 0.12 14322 115 0.48 0.11 14322 120
Hourly Avg 0.25 337596 124 0.44 0.25 337596 125
Hourly Max 0.15 337596 123 0.41 0.14 337596 125
Everything w D.I
Daily Avg 0.11 15595 115 0.56 -0.01 15595 120
Daily Max 0.12 15595 115 0.51 -0.01 15595 120
Hourly Avg 0.05 367838 126 0.44 0.00 367838 125
Hourly Max 0.05 367838 125 0.42 0.00 367838 126
Everything w.o D.I
Daily Avg 0.11 15595 114 0.54 -0.01 15595 119
Daily Max 0.12 15595 115 0.49 -0.01 15595 120
Hourly Avg 0.05 367838 125 0.44 0.00 367838 124
Hourly Max 0.05 367838 125 0.41 0.00 367838 125

XGBoost (Daily Average)

Since 2022 Feb

validation_result <- tibble(Model = character(),
                               model_response_name = character(),
                               daterange = character(),
                               'Coef' = character(),
                               'R-Sq' = numeric(),
                               'Disaster RMSE' = numeric(),
                               'Normal RMSE' = numeric())

xgb_result <- tibble(Model = character(),
                        model_response_name = character(),
                        daterange = character(),
                        'R-Sq' = numeric(),
                        'BT R-Sq' = numeric(),
                        'RMSE' = numeric(),
                        'BT RMSE' = numeric())

fit.xgb_da_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_sincefeb2022)
fit.xgb_da_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 700
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 141     700         6 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x", 
           "Latitude" = "mon_utm_y",
           "Distance to Refinery" = "dist_ref", 
           "Angle to Refinery" = "angle_ref",
           "Active Wells within 2km" = "active_2km", 
           "Inactive Wells within 2km" = "inactive_2km",
           "Monthly Oil Production 2km" = "monthly_oil_2km",
           "Monthly Gas Production 2km" = "monthly_gas_2km",
           "Distance to WRP" = "dist_wrp",
           "WRP Capacity" = "closest_wrp_capacity",
           "Angle to WRP" = "angle_wrp",
           "Distance to Dominguez Channel" = "dist_dc",
           "Average Daily Temperature" = "daily_temp",
           "Average Daily Humidity" = "daily_hum",
           "Daily Precipitation" = "daily_precip",
           "Average Daily Wind Speed" = "ws_avg",
           "Average Daily Wind Direction" = "wd_avg",
           "Downwind Refinery" = "daily_downwind_ref",
           "Downwind WRP" = "daily_downwind_wrp",
           "Elevation" = "elevation",
           "Enhanced Vegetation Index" = "EVI",
           "Number of Daily Odor Complaints" = "num_odor_complaints",
           "2020" = "year_2020",
           "2021" = "year_2021",
           "2022" = "year_2022",
           "2023" = "year_2023",
           "January" = "month_01",
           "February" = "month_02",
           "March" = "month_03",
           "April" = "month_04",
           "May" = "month_05",
           "June" = "month_06", 
           "July" = "month_07",
           "August" = "month_08",
           "September" = "month_09",
           "October" = "month_10",
           "November" = "month_11",
           "December" = "month_12",
           "Monday" = "weekday_Mon",
           "Tuesday" = "weekday_Tue",
           "Wednesday" = "weekday_Wed",
           "Thursday" = "weekday_Thu",
           "Friday" = "weekday_Fri",
           "Saturday" = "weekday_Sat",
           "Sunday" = "weekday_Sun",
           "Odor Event" = "disaster")

imp<-varImp(fit.xgb_da_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_sincefeb2022$pred$obs, pred = fit.xgb_da_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                              tibble(Model = 'Since Feb 2022',
                                     model_response_name = 'da',
                                     daterange = 'sincefeb2022',
                                     'Coef' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                     'R-Sq' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_sincefeb2022$pred$pred))$r.squared,
                                     'Disaster RMSE' = NA,
                                     'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_sincefeb2022$trainingData),
                       fit.xgb_da_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                        model_response_name = 'da',
                        daterange = 'sincefeb2022',
                        'R-Sq' = test_adj_r2,
                        'BT R-Sq' = NA,
                        'RMSE' = test_rmse,
                        'BT RMSE' = NA))

Disaster

fit.xgb_da_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 868.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 300
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 121     300         6 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_da_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'da',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_da_dis$pred$obs ~
                                                        fit.xgb_da_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_dis$pred$obs ~
                                                        fit.xgb_da_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_dis$trainingData),
                       fit.xgb_da_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'da',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_da_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_da_excl_dis.rds')
getTrainPerf(fit.xgb_da_excl_dis)
fit.xgb_da_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 150     700         6 0.1  0.01             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_da_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_excl_dis$pred$obs, 
                                               pred = fit.xgb_da_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'da',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
                                                        fit.xgb_da_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
                                                        fit.xgb_da_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_excl_dis$trainingData), 
                       fit.xgb_da_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'da',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_da_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis_ind.rds')
getTrainPerf(fit.xgb_da_dis_ind)
fit.xgb_da_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.5", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 21     700         4 0.1 0.001                1                0       0.5
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_light()

ggsave('../figures/dis_ind_da_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_da_dis_ind$pred$obs, 
                           pred = fit.xgb_da_dis_ind$pred$pred,
                           disaster = fit.xgb_da_dis_ind$trainingData$disaster[fit.xgb_da_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_da_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
da_dis_ind_obs_vs_pred_plot <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  theme_bw()

da_dis_ind_obs_vs_pred_plot_zoom <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10)) +
  theme_bw()
ggarrange(da_dis_ind_obs_vs_pred_plot, da_dis_ind_obs_vs_pred_plot_zoom,
          labels = c("1", "2"), nrow = 1)

ggsave('../figures/dis_ind_da_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'da',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
                                                        fit.xgb_da_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
                                                        fit.xgb_da_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_dis_ind$trainingData), 
                       fit.xgb_da_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'da',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_da_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 243     700         5 0.3 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_full$pred$obs, 
                           pred = fit.xgb_da_full$pred$pred,
                           disaster = if_else(fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_da_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'da',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_full$trainingData),
                       fit.xgb_da_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'da',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))
knitr::kable(validation_result, digits = 3)
Model model_response_name daterange Coef R-Sq Disaster RMSE Normal RMSE
Since Feb 2022 da sincefeb2022 1.012 0.759 NA NA
Disaster Only da dis 1.172 0.642 NA NA
Exclude Disaster da excl_dis 1.004 0.673 NA NA
Everything w. Disaster Indicator da dis_ind 1.063 0.970 15.828 0.477
Everything w.o Disaster Indicator da full 1.049 0.989 9.919 0.137

XGBoost: log(H2S_daily_avg)

Since Feb 2022

fit.xgb_da_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_log_h2s_sincefeb2022)
fit.xgb_da_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 700
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123     700         6 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_da',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_da',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_da_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis)
fit.xgb_da_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 700
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 69     700         5 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_da',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_log_h2s_dis$trainingData), 
                       fit.xgb_da_log_h2s_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_dis$trainingData), 
                       fit.xgb_da_log_h2s_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_da',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_da_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_excl_dis)
fit.xgb_da_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123     700         6 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_da',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                      nrow(fit.xgb_da_log_h2s_excl_dis$trainingData), 
                      fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)


BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_excl_dis$trainingData), 
                       fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_da',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_da_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis_ind)
fit.xgb_da_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285     700         6 0.3 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_log_h2s_dis_ind$pred$obs, 
                           pred = fit.xgb_da_log_h2s_dis_ind$pred$pred,
                           disaster = fit.xgb_da_log_h2s_dis_ind$trainingData$disaster[fit.xgb_da_log_h2s_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_da',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_log_h2s_dis_ind$trainingData), 
                       fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_dis_ind$trainingData), 
                       fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_da',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_da_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_full.rds')
getTrainPerf(fit.xgb_da_log_h2s_full)
fit.xgb_da_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285     700         6 0.3 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_log_h2s_full$pred$obs, 
                           pred = fit.xgb_da_log_h2s_full$pred$pred,
                           disaster = if_else(fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_full$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_da',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
                                                        fit.xgb_da_log_h2s_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
                                                        fit.xgb_da_log_h2s_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_log_h2s_full$trainingData), 
                       fit.xgb_da_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_da_log_h2s_full$trainingData), 
                       fit.xgb_da_log_h2s_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_da',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGBoost (Daily Max)

Since 2022 Feb

fit.xgb_dm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_sincefeb2022)
fit.xgb_dm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 300
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 151     300         6 0.1  0.01             0.75                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_sincefeb2022$pred$obs, pred = fit.xgb_dm_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'dm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_dm_sincefeb2022$trainingData),
                       fit.xgb_dm_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'dm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Disaster

fit.xgb_dm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis.rds')
getTrainPerf(fit.xgb_dm_dis)
fit.xgb_dm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 459.6 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 300
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 43     300         4 0.1  0.01             0.75                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_dm_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'dm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_dm_dis$pred$obs ~
                                                        fit.xgb_dm_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_dis$pred$obs ~
                                                        fit.xgb_dm_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_dm_dis$trainingData),
                       fit.xgb_dm_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'dm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_dm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_excl_dis.rds')
getTrainPerf(fit.xgb_dm_excl_dis)
fit.xgb_dm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 300
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 142     300         6 0.1  0.01              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_dm_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_excl_dis$pred$obs, 
                                               pred = fit.xgb_dm_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'dm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
                                                        fit.xgb_dm_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
                                                        fit.xgb_dm_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_excl_dis$trainingData), 
                       fit.xgb_dm_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'dm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_dm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis_ind.rds')
getTrainPerf(fit.xgb_dm_dis_ind)
fit.xgb_dm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 294     700         6 0.3 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_light()

ggsave('../figures/dis_ind_dm_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_dm_dis_ind$pred$obs, 
                           pred = fit.xgb_dm_dis_ind$pred$pred,
                           disaster = fit.xgb_dm_dis_ind$trainingData$disaster[fit.xgb_dm_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_dm_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
dm_dis_ind_obs_vs_pred_plot <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  theme_bw()

dm_dis_ind_obs_vs_pred_plot_zoom <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
  theme_bw()
ggarrange(dm_dis_ind_obs_vs_pred_plot, dm_dis_ind_obs_vs_pred_plot_zoom,
          labels = c("1", "2"), nrow = 1)

ggsave('../figures/dis_ind_dm_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'dm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
                                                        fit.xgb_dm_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
                                                        fit.xgb_dm_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_dis_ind$trainingData), 
                       fit.xgb_dm_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'dm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_dm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_full.rds')
getTrainPerf(fit.xgb_dm_full)
fit.xgb_dm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 231     700         5 0.3 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_full$pred$obs, 
                           pred = fit.xgb_dm_full$pred$pred,
                           disaster = if_else(fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_dm_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'dm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
train_adj_r2 <- adj_r2(getTrainPerf(fit.xgb_dm_full)$TrainRsquared,
                       nrow(fit.xgb_dm_full$trainingData),
                       fit.xgb_dm_full$finalModel$nfeatures)
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_dm_full$trainingData),
                       fit.xgb_dm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'dm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

XGBoost: log(H2S_daily_max)

Since Feb 2022

fit.xgb_dm_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_log_h2s_sincefeb2022)
fit.xgb_dm_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 300
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 133     300         6 0.1 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_dm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_dm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_dm_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis)
fit.xgb_dm_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 793.6 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 500
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 32     500         4 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_dm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_dis$trainingData), 
                       fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_dis$trainingData), 
                       fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_dm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_dm_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_excl_dis)
fit.xgb_dm_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 500
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 158     500         6 0.1  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_dm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData), 
                       fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData), 
                       fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_dm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_dm_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis_ind)
fit.xgb_dm_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321     700         6 0.3  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_dis_ind$pred$obs, 
                           pred = fit.xgb_dm_log_h2s_dis_ind$pred$pred,
                           disaster = fit.xgb_dm_log_h2s_dis_ind$trainingData$disaster[fit.xgb_dm_log_h2s_dis_ind$pred$rowIndex])

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_dm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData), 
                       fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData), 
                       fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_dm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_dm_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_full.rds')
getTrainPerf(fit.xgb_dm_log_h2s_full)
fit.xgb_dm_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321     700         6 0.3  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_full$pred$obs, 
                           pred = fit.xgb_dm_log_h2s_full$pred$pred,
                           disaster = if_else(fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
                                                   fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
                                                   fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_full$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_dm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
                                                        fit.xgb_dm_log_h2s_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
                                                        fit.xgb_dm_log_h2s_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_full$trainingData), 
                       fit.xgb_dm_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_dm_log_h2s_full$trainingData), 
                       fit.xgb_dm_log_h2s_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_dm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGBoost (Hourly Average)

Since 2022 Feb

fit.xgb_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_ha_sincefeb2022)
fit.xgb_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.7 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x", 
           "Latitude" = "mon_utm_y",
           "Distance to Refinery" = "dist_ref", 
           "Angle to Refinery" = "angle_ref",
           "Active Wells within 2km" = "active_2km", 
           "Inactive Wells within 2km" = "inactive_2km",
           "Monthly Oil Production 2km" = "monthly_oil_2km",
           "Monthly Gas Production 2km" = "monthly_gas_2km",
           "Distance to WRP" = "dist_wrp",
           "WRP Capacity" = "closest_wrp_capacity",
           "Angle to WRP" = "angle_wrp",
           "Distance to Dominguez Channel" = "dist_dc",
           "Hourly Temperature" = "hourly_temp",
           "Hourly Humidity" = "hourly_hum",
           "Hourly Precipitation" = "hourly_precip",
           "Hourly Wind Speed" = "ws_avg",
           "Hourly Wind Direction" = "wd_avg",
           "Downwind Refinery" = "hourly_downwind_ref",
           "Downwind WRP" = "hourly_downwind_wrp",
           "Elevation" = "elevation",
           "Enhanced Vegetation Index" = "EVI",
           "Number of Daily Odor Complaints" = "num_odor_complaints",
           "2020" = "year_2020",
           "2021" = "year_2021",
           "2022" = "year_2022",
           "2023" = "year_2023",
           "January" = "month_01",
           "February" = "month_02",
           "March" = "month_03",
           "April" = "month_04",
           "May" = "month_05",
           "June" = "month_06", 
           "July" = "month_07",
           "August" = "month_08",
           "September" = "month_09",
           "October" = "month_10",
           "November" = "month_11",
           "December" = "month_12",
           "Monday" = "weekday_Mon",
           "Tuesday" = "weekday_Tue",
           "Wednesday" = "weekday_Wed",
           "Thursday" = "weekday_Thu",
           "Friday" = "weekday_Fri",
           "Saturday" = "weekday_Sat",
           "Sunday" = "weekday_Sun",
           "Odor Event" = "disaster")

imp<-varImp(fit.xgb_ha_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_ha_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_sincefeb2022$pred$obs, pred = fit.xgb_ha_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                              tibble(Model = 'Since Feb 2022',
                                     model_response_name = 'ha',
                                     daterange = 'sincefeb2022',
                                     'Coef' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_ha_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                     'R-Sq' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_ha_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_ha_sincefeb2022$trainingData),
                       fit.xgb_ha_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'ha',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Disaster

fit.xgb_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis.rds')
getTrainPerf(fit.xgb_ha_dis)
fit.xgb_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 84.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 50
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 51      50         4 0.4 0.001              0.8                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_ha_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'ha',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_ha_dis$pred$obs ~
                                                        fit.xgb_ha_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_ha_dis$pred$obs ~
                                                        fit.xgb_ha_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_ha_dis$trainingData),
                       fit.xgb_ha_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'ha',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_ha_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_excl_dis.rds')
getTrainPerf(fit.xgb_ha_excl_dis)
fit.xgb_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.1 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 52     150         4 0.4 0.001              0.8                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_ha_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_excl_dis$pred$obs, 
                                               pred = fit.xgb_ha_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'ha',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
                                                        fit.xgb_ha_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
                                                        fit.xgb_ha_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_ha_excl_dis$trainingData), 
                       fit.xgb_ha_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'ha',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis_ind.rds')
getTrainPerf(fit.xgb_ha_dis_ind)
fit.xgb_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62     150         4 0.4  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_light()

ggsave('../figures/dis_ind_ha_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_ha_dis_ind$pred$obs, 
                           pred = fit.xgb_ha_dis_ind$pred$pred,
                           disaster = fit.xgb_ha_dis_ind$trainingData$disaster[fit.xgb_ha_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_ha_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
ha_dis_ind_obs_vs_pred_plot <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  theme_bw()

ha_dis_ind_obs_vs_pred_plot_zoom <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  coord_cartesian(xlim = c(0, 20), ylim = c(0, 20)) +
  theme_bw()
ggarrange(ha_dis_ind_obs_vs_pred_plot, ha_dis_ind_obs_vs_pred_plot_zoom,
          labels = c("1", "2"), nrow = 1)

ggsave('../figures/dis_ind_ha_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'ha',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
                                                        fit.xgb_ha_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
                                                        fit.xgb_ha_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_ha_dis_ind$trainingData), 
                       fit.xgb_ha_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'ha',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_full.rds')
getTrainPerf(fit.xgb_ha_full)
fit.xgb_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_ha_full$pred$obs, 
                           pred = fit.xgb_ha_full$pred$pred,
                           disaster = if_else(fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_ha_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'ha',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_ha_full$trainingData),
                       fit.xgb_ha_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'ha',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

XGBoost: log(H2S_hourly_avg)

Since Feb 2022

fit.xgb_log_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_ha_sincefeb2022)
fit.xgb_log_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_ha',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_ha_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_ha_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_sincefeb2022$trainingData), 
                       fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_sincefeb2022$trainingData), 
                       fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_ha',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_log_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis.rds')
getTrainPerf(fit.xgb_log_ha_dis)
fit.xgb_log_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 250.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 150
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_ha',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
                                                        fit.xgb_log_ha_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
                                                        fit.xgb_log_ha_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_dis$trainingData), 
                       fit.xgb_log_ha_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_dis$trainingData), 
                       fit.xgb_log_ha_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_ha',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_log_ha_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_excl_dis.rds')
getTrainPerf(fit.xgb_log_ha_excl_dis)
fit.xgb_log_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_ha',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
                                                        fit.xgb_log_ha_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
                                                        fit.xgb_log_ha_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_excl_dis$trainingData), 
                       fit.xgb_log_ha_excl_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_excl_dis$trainingData), 
                       fit.xgb_log_ha_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_ha',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_log_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis_ind.rds')
getTrainPerf(fit.xgb_log_ha_dis_ind)
fit.xgb_log_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_ha_dis_ind$pred$obs, 
                           pred = fit.xgb_log_ha_dis_ind$pred$pred,
                           disaster = fit.xgb_log_ha_dis_ind$trainingData$disaster[fit.xgb_log_ha_dis_ind$pred$rowIndex])

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_ha',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
                                                        fit.xgb_log_ha_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
                                                        fit.xgb_log_ha_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_dis_ind$trainingData), 
                       fit.xgb_log_ha_dis_ind$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_dis_ind$trainingData), 
                       fit.xgb_log_ha_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_ha',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_log_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_full.rds')
getTrainPerf(fit.xgb_log_ha_full)
fit.xgb_log_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62     150         4 0.4  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_ha_full$pred$obs, 
                           pred = fit.xgb_log_ha_full$pred$pred,
                           disaster = if_else(fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_10 == 1 |
                                                   fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_11 == 1 |
                                                   fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_full$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_ha',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
                                                        fit.xgb_log_ha_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
                                                        fit.xgb_log_ha_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_full$trainingData), 
                       fit.xgb_log_ha_full$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_log_ha_full$trainingData), 
                       fit.xgb_log_ha_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_ha',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGBoost (Hourly Max)

Since 2022 Feb

fit.xgb_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_hm_sincefeb2022)
fit.xgb_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 30     150         4 0.2  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_hm_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_sincefeb2022$pred$obs, pred = fit.xgb_hm_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'hm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_hm_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_hm_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_hm_sincefeb2022$trainingData),
                       fit.xgb_hm_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'hm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Disaster

fit.xgb_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis.rds')
getTrainPerf(fit.xgb_hm_dis)
fit.xgb_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 86.2 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 50
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 29      50         4 0.2  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_hm_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'hm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_hm_dis$pred$obs ~
                                                        fit.xgb_hm_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_dis$pred$obs ~
                                                        fit.xgb_hm_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_hm_dis$trainingData),
                       fit.xgb_hm_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'hm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_hm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_excl_dis.rds')
getTrainPerf(fit.xgb_hm_excl_dis)
fit.xgb_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.7 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 60     150         4 0.4  0.01              0.8                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_hm_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_excl_dis$pred$obs, 
                                               pred = fit.xgb_hm_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'hm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
                                                        fit.xgb_hm_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
                                                        fit.xgb_hm_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_hm_excl_dis$trainingData), 
                       fit.xgb_hm_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'hm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis_ind.rds')
getTrainPerf(fit.xgb_hm_dis_ind)
fit.xgb_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_light()

ggsave('../figures/dis_ind_hm_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_hm_dis_ind$pred$obs, 
                           pred = fit.xgb_hm_dis_ind$pred$pred,
                           disaster = fit.xgb_hm_dis_ind$trainingData$disaster[fit.xgb_hm_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_hm_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
hm_dis_ind_obs_vs_pred_plot <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  theme_bw()

hm_dis_ind_obs_vs_pred_plot_zoom <- 
  ggplot(tibble(obs = test_result_data$obs, 
                pred = test_result_data$pred,
                disaster = test_result_data$disaster),
         aes(x = pred, y = obs)) +
  geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +    
  labs(y = 'Observed', x = 'Predicted', title = '') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
  stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +        
  stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
  theme_bw()
ggarrange(hm_dis_ind_obs_vs_pred_plot, hm_dis_ind_obs_vs_pred_plot_zoom,
          labels = c("1", "2"), nrow = 1)

ggsave('../figures/dis_ind_hm_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'hm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
                                                        fit.xgb_hm_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
                                                        fit.xgb_hm_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_hm_dis_ind$trainingData), 
                       fit.xgb_hm_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'hm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_full.rds')
getTrainPerf(fit.xgb_hm_full)
fit.xgb_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 248.7 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_hm_full$pred$obs, 
                           pred = fit.xgb_hm_full$pred$pred,
                           disaster = if_else(fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_hm_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'hm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_hm_full$trainingData),
                       fit.xgb_hm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'hm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

XGBoost: log(H2S_hourly_max)

Since Feb 2022

fit.xgb_log_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_hm_sincefeb2022)
fit.xgb_log_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_hm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_hm_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_hm_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_sincefeb2022$trainingData), 
                       fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_sincefeb2022$trainingData), 
                       fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_hm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_log_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis.rds')
getTrainPerf(fit.xgb_log_hm_dis)
fit.xgb_log_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 150
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_hm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
                                                        fit.xgb_log_hm_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
                                                        fit.xgb_log_hm_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_dis$trainingData), 
                       fit.xgb_log_hm_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_dis$trainingData), 
                       fit.xgb_log_hm_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_hm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_log_hm_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_excl_dis.rds')
getTrainPerf(fit.xgb_log_hm_excl_dis)
fit.xgb_log_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_hm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
                                                        fit.xgb_log_hm_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
                                                        fit.xgb_log_hm_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_excl_dis$trainingData), 
                       fit.xgb_log_hm_excl_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_excl_dis$trainingData), 
                       fit.xgb_log_hm_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_hm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_log_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis_ind.rds')
getTrainPerf(fit.xgb_log_hm_dis_ind)
fit.xgb_log_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_hm_dis_ind$pred$obs, 
                           pred = fit.xgb_log_hm_dis_ind$pred$pred,
                           disaster = fit.xgb_log_hm_dis_ind$trainingData$disaster[fit.xgb_log_hm_dis_ind$pred$rowIndex])

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_hm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
                                                        fit.xgb_log_hm_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
                                                        fit.xgb_log_hm_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_dis_ind$trainingData), 
                       fit.xgb_log_hm_dis_ind$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_dis_ind$trainingData), 
                       fit.xgb_log_hm_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_hm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_log_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_full.rds')
getTrainPerf(fit.xgb_log_hm_full)
fit.xgb_log_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_hm_full$pred$obs, 
                           pred = fit.xgb_log_hm_full$pred$pred,
                           disaster = if_else(fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_10 == 1 |
                                                   fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_11 == 1 |
                                                   fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_full$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_hm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
                                                        fit.xgb_log_hm_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
                                                        fit.xgb_log_hm_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_full$trainingData), 
                       fit.xgb_log_hm_full$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_log_hm_full$trainingData), 
                       fit.xgb_log_hm_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_hm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGB Model performances

validation_result
xgb_result

GAM VS XGBoost

base_validation_result <- validation_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == '') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
  arrange(factor(Model, levels = unique(validation_result$Model)))

log_validation_result <- validation_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == 'Log') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max'))

base_xgb_result <- xgb_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == '') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
  select(-transformation, -`BT R-Sq`, -`BT RMSE`)

log_xgb_result <- xgb_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == 'Log') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
  select(-transformation)
# This is the result from regressing the observed on predicted
options(knitr.kable.NA = '')
validation_result_table <- base_validation_result %>% 
  left_join(log_validation_result, join_by(Model, response_base)) %>%
  select(all_of(c('response_base', 'Coef.x', 'R-Sq.x', 'Normal RMSE.x', 
                  'Disaster RMSE.x', 'Coef.y', 'R-Sq.y', 'Normal RMSE.y', 
                  'Disaster RMSE.y'))) %>%
  setNames(c('Response', 'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE', 
             'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE'))

validation_result_table %>%
  knitr::kable(format = 'pipe', digits = 3, table.attr = "style='width:100%;'") %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 1, 'No Transformation' = 4, 'Log-Transformation' = 4)) %>%
  kable_styling()
No Transformation
Log-Transformation
Response Coef R2 Normal RMSE Disaster RMSE Coef R2 Normal RMSE Disaster RMSE
Since Feb 2022
Daily Avg 1.012 0.759 1.019 0.814
Daily Max 0.743 0.207 0.992 0.614
Hourly Avg 1.011 0.533 1.028 0.698
Hourly Max 1.033 0.365 1.025 0.638
Disaster Only
Daily Avg 1.172 0.642 1.015 0.864
Daily Max 1.041 0.532 1.020 0.821
Hourly Avg 0.925 0.434 1.002 0.771
Hourly Max 0.894 0.485 1.000 0.750
Exclude Disaster
Daily Avg 1.004 0.673 1.025 0.792
Daily Max 0.793 0.137 0.984 0.608
Hourly Avg 1.033 0.403 1.047 0.653
Hourly Max 0.965 0.276 1.043 0.604
Everything w D.I
Daily Avg 1.063 0.970 0.477 15.828 1.009 0.977 0.121 0.135
Daily Max 0.995 0.997 1.263 36.075 1.008 0.961 0.202 0.205
Hourly Avg 1.013 0.940 1.686 32.007 1.043 0.680 0.496 0.669
Hourly Max 1.042 0.915 4.313 81.977 1.043 0.642 0.567 0.719
Everything w.o D.I
Daily Avg 1.049 0.989 0.137 9.919 1.011 0.975 0.121 0.198
Daily Max 0.990 0.994 5.095 50.451 1.010 0.958 0.212 0.197
Hourly Avg 1.030 0.958 2.041 26.420 1.045 0.679 0.496 0.674
Hourly Max 0.948 0.874 4.301 100.490 1.046 0.641 0.567 0.728
full_result_table_fordisp <- base_table %>%
  left_join(log_table %>% select(-n), join_by(date_names, response_base)) %>%
  left_join(base_xgb_result, join_by(date_names == Model, response_base)) %>%
  left_join(log_xgb_result, join_by(date_names == Model, response_base)) %>%
  select(-starts_with('model_response_name'), -starts_with('daterange'), -'date_names') %>%
  select(all_of(c('response_base', 'n', 'adjr2.x', 'p.x', 'adjr2.y', 'bt_adjr2', 'p.y', 'R-Sq.x', 'RMSE.x', 'R-Sq.y', 'RMSE.y', 'BT R-Sq', 'BT RMSE'))) %>%
  setNames(c('Response','N', 'R2', 'P', 'R2', 'BT R2', 'P',
             c('R2', 'RMSE'),  
             c('R2', 'RMSE', 'BT R2', 'BT RMSE')))
  

full_result_table_kable <- full_result_table_fordisp %>%
  knitr::kable(format = 'latex', digits = 2) %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
  add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
  add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6))

writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')

full_result_table_fordisp %>%
  knitr::kable(format = 'pipe', digits = 4, table.attr = "style='width:100%;'") %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
  add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
  add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6)) %>%
  kable_styling()
GAM
XGBoost
No Transformation
Log-Transformation
No Transformation
Log-Transformation
Test
Test
Response N R2 P R2 BT R2 P R2 RMSE R2 RMSE BT R2 BT RMSE
Since Feb 2022
Daily Avg 6531 0.6175 117 0.6979 0.6132 116 0.7614 0.2478 0.8127 0.3059 0.7637 0.2478
Daily Max 6531 0.2186 116 0.5433 0.2190 115 0.2879 3.1244 0.6123 0.5722 0.3557 2.9330
Hourly Avg 153718 0.3515 123 0.5466 0.3400 122 0.5345 0.4892 0.6975 0.4493 0.5042 0.5114
Hourly Max 153718 0.1748 122 0.4954 0.1608 122 0.3769 1.0293 0.6383 0.5400 0.2907 1.1047
Disaster Only
Daily Avg 1273 0.4170 105 0.7458 0.5459 111 0.5257 40.7997 0.8532 0.4274 0.8037 30.9226
Daily Max 1273 0.4320 99 0.7246 0.4399 110 0.7240 360.8105 0.8146 0.6128 0.7604 296.5671
Hourly Avg 30242 0.2097 114 0.5830 0.2170 114 0.4512 96.7972 0.7705 0.6075 0.4515 97.9899
Hourly Max 30242 0.2229 114 0.5601 0.2360 114 0.5473 196.2375 0.7492 0.6631 0.4520 213.3409
Exclude Disaster
Daily Avg 14322 0.4586 120 0.5666 0.4598 121 0.7043 0.3188 0.7915 0.3326 0.6951 0.3258
Daily Max 14322 0.1231 115 0.4800 0.1072 120 0.2722 3.4251 0.6073 0.5725 0.3417 3.2564
Hourly Avg 337596 0.2481 124 0.4442 0.2499 125 0.4896 0.6074 0.6526 0.4940 0.4561 0.6354
Hourly Max 337596 0.1497 123 0.4112 0.1413 125 0.3360 1.1044 0.6043 0.5672 0.2918 1.1416
Everything w D.I
Daily Avg 15595 0.1137 115 0.5588 -0.0072 120 0.9689 3.0545 0.9775 0.1219 0.9812 1.6211
Daily Max 15595 0.1236 115 0.5080 -0.0073 120 0.9628 6.7763 0.9609 0.2016 0.9980 4.3671
Hourly Avg 367838 0.0486 126 0.4428 0.0023 125 0.9322 8.6180 0.6801 0.5121 0.4724 29.6837
Hourly Max 367838 0.0505 125 0.4177 0.0010 126 0.9223 21.4359 0.6415 0.5812 0.4736 62.7187
Everything w.o D.I
Daily Avg 15595 0.1108 114 0.5431 -0.0073 119 0.9968 1.1273 0.9747 0.1289 0.9073 4.5043
Daily Max 15595 0.1206 115 0.4907 -0.0074 120 0.9797 8.7597 0.9574 0.2098 0.9966 7.8280
Hourly Avg 367838 0.0475 125 0.4353 0.0011 124 0.9491 7.5139 0.6790 0.5131 0.4320 29.1288
Hourly Max 367838 0.0495 125 0.4095 0.0004 125 0.8837 25.5470 0.6408 0.5819 0.4550 63.2933